{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import textwrap\n",
"from pathlib import Path\n",
"import shutil\n",
"\n",
"from dotenv import find_dotenv, load_dotenv, dotenv_values\n",
"\n",
"import openai\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"\n",
"from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceEmbeddings\n",
"from sentence_transformers import SentenceTransformer\n",
"import numpy as np\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"config = dotenv_values(\"../.env\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-base')\n",
"model = SentenceTransformer('thenlper/gte-base')"
]
},
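{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check of the embedding model, added here as a minimal sketch (the probe sentences are made up for illustration): related texts should score a noticeably higher cosine similarity than unrelated ones."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Encode three probe sentences; the first two are related, the third is not.\n",
"probe = model.encode([\n",
"    'Crop rotation improves soil organic matter.',\n",
"    'Rotating crops can increase soil organic carbon.',\n",
"    'The stock market closed higher today.'\n",
"])\n",
"probe = probe / np.linalg.norm(probe, axis=1, keepdims=True)  # unit-normalize rows\n",
"print('related pair:  ', float(probe[0] @ probe[1]))\n",
"print('unrelated pair:', float(probe[0] @ probe[2]))"
]
},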
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def parse_bib_file(bib_content):\n",
" \"\"\"Parse the content of a single .bib file and return a list of articles.\"\"\"\n",
" # Split the content by \"@article\" to get individual articles\n",
" articles = re.split(r'(?=@article)', bib_content)\n",
" \n",
" article_list = []\n",
" \n",
" for article in articles:\n",
" if not article.strip():\n",
" continue\n",
" \n",
" # Extract the unique identifier for the article\n",
" match = re.search(r'@article\\{ (?P<id>.*?),', article)\n",
" if not match:\n",
" continue\n",
" article_id = match.group('id').strip()\n",
" \n",
" # Extract the fields within the article entry\n",
" fields = re.findall(r'(\\w+) = \\{(.*?)\\},', article, re.DOTALL)\n",
" article_dict = {'ID': article_id}\n",
" \n",
" for field, value in fields:\n",
" article_dict[field] = value.strip().replace('\\n', ' ')\n",
" \n",
" article_list.append(article_dict)\n",
" \n",
" return article_list\n",
"\n",
"def extract_articles_from_folder(folder_path, article_type=None):\n",
" \"\"\"Extract articles from multiple .bib files in the provided folder and optionally filter by article type.\"\"\"\n",
" all_articles = []\n",
" \n",
" # Iterate over all files in the folder\n",
" for filename in os.listdir(folder_path):\n",
" if filename.endswith(\".bib\"):\n",
" with open(os.path.join(folder_path, filename), \"r\") as file:\n",
" bib_content = file.read()\n",
" articles = parse_bib_file(bib_content)\n",
" all_articles.extend(articles)\n",
" \n",
" # Optionally filter by article type\n",
" if article_type:\n",
" all_articles = [article for article in all_articles if article.get('Type') == article_type]\n",
" \n",
" return all_articles\n",
"\n",
"def parse_arbitrary_text(text, chunk_size=1000, chunk_overlap=200, field=None):\n",
" text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
" chunked_text = text_splitter.create_documents([text])\n",
" return chunked_text\n",
"\n",
"def get_list_of_chunks(articles_dict_list):\n",
" articles_filtered = list(filter(lambda x: 'Abstract' in x.keys(), articles_dict_list))\n",
" chunks_abstracts = list(map(lambda x: parse_arbitrary_text(x['Abstract']), articles_filtered))\n",
" return list(map(lambda x: list(map(lambda y: y.page_content, x)), chunks_abstracts))\n",
"\n",
"def get_abstract_embedding_with_mapping(bibliography_folder_path, article_type=None):\n",
" articles_abstracts = extract_articles_from_folder(bibliography_folder_path, article_type=article_type)\n",
" articles_chunked_abstracts = get_list_of_chunks(articles_abstracts)\n",
" chunks_abstracts_embeddings = list(map(lambda x: model.encode(x), articles_chunked_abstracts))\n",
" flattened_chunks_abstracts_embeddings = np.array([vec for sublist in chunks_abstracts_embeddings for vec in sublist])\n",
" # Create a mapping of numpy array row index to original list index\n",
" index_mapping = [(i, j) for i, sublist in enumerate(chunks_abstracts_embeddings) for j, _ in enumerate(sublist)]\n",
" return articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping\n",
"\n",
"def get_cluster_representation(articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping, n_clusters):\n",
" # Run KMeans clustering\n",
" kmeans = KMeans(n_clusters=n_clusters, n_init=10)\n",
" kmeans.fit(flattened_chunks_abstracts_embeddings)\n",
" centroids = kmeans.cluster_centers_\n",
" # Find the indices of the closest elements in the flattened_array for each centroid\n",
" closest_point_indices = [np.argmin(np.linalg.norm(flattened_chunks_abstracts_embeddings - centroid, axis=1)) for centroid in centroids]\n",
" closest_chunks = []\n",
" for closest_point_index in closest_point_indices:\n",
" # print(index_mapping[closest_point_index])\n",
" index_abstract, index_embedding = index_mapping[closest_point_index]\n",
" closest_chunks.append(articles_chunked_abstracts[index_abstract][index_embedding])\n",
" return closest_chunks, centroids\n",
"\n",
"def perform_cluster_analysis(bibliography_folder_path, n_clusters=3):\n",
" articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping = get_abstract_embedding_with_mapping(bibliography_folder_path)\n",
" closest_chunks, centroids = get_cluster_representation(articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping, n_clusters)\n",
" return closest_chunks, centroids"
]
},
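{
"cell_type": "markdown",
"metadata": {},
"source": [
"The number of clusters is chosen by hand below (4 in the next cell). As an optional heuristic, not part of the original pipeline (`suggest_n_clusters` is a hypothetical helper), one could scan a few candidate values with scikit-learn's silhouette score and pick the best-scoring one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import silhouette_score\n",
"\n",
"def suggest_n_clusters(embeddings_array, candidates=range(2, 9)):\n",
"    \"\"\"Score candidate cluster counts by silhouette (higher is better); a rough heuristic only.\"\"\"\n",
"    scores = {}\n",
"    for k in candidates:\n",
"        labels = KMeans(n_clusters=k, n_init=10).fit_predict(embeddings_array)\n",
"        scores[k] = silhouette_score(embeddings_array, labels)\n",
"    return max(scores, key=scores.get), scores"
]
},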
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping = get_abstract_embedding_with_mapping('bibliography_analyzer/bibliography')\n",
"closest_chunks, centroids = get_cluster_representation(articles_chunked_abstracts, flattened_chunks_abstracts_embeddings, index_mapping, 4)"
]
},
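{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick inspection of the intermediate results (added for orientation): the embedding matrix has one row per abstract chunk, and `index_mapping[i]` points row `i` back to its `(article index, chunk index)` pair."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('articles with abstracts:', len(articles_chunked_abstracts))\n",
"print('embedding matrix shape: ', flattened_chunks_abstracts_embeddings.shape)\n",
"print('first mapping entries:  ', index_mapping[:5])"
]
},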
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['enough to highlight potential weed problems and solutions when comparing alternative crop rotations for a Field. The model has been incorporated into a decision Support tool for whole-farm management so growers in the Central Great Plains of the United States can compare alternative crop rotations and how their choice influences farm income, herbicide use, and control of weeds in their fields.',\n",
" 'of SOC. The soils were also physically fractionated by density (light fraction, LF) and size (sand, silt and clay). With improved management, the SOC storage in the clay size fraction showed the largest increase across all fractions. This increase was greater for MS than MM. The NTMS treatment resulted in a decline in silt-OC storage compared to CTMM. The SOC mineralization (mg CO2-C g(-1)soil) was affected by tillage and driven by LF-OC and was observed in the order: NTMM (2.06) > MPMM (1.72) approximate to NTMS (1.71) > CTMM (1.52) approximate to MPMS (1.41). Both cropping and depth affected the biodegradability of SOC. Considering the plough layer (0-20 cm), treatments under MM had a larger proportion of biodegradable SOC than those under MS. We conclude that the significant differences in SOC storage in physical fractions and SOC biodegradation were caused by differences in soil management. Highlights Clay size fraction showed the largest',\n",
" 'considered the risks of yield losses and/or lower harvest quality plus harvesting difficulties. In the medium term, they anticipated the risk of finding a weed species in another crop of the rotation where control would be difficult or costly, weighing the risks of yield loss against the cost and effectiveness of solutions, not only in the current crop but also in subsequent crops, so that once again, the rotation was the central focus of weed control. In the long term, their main aim was to limit the soil seed bank to an acceptable level. The farmers interviewed stated that they would continue to implement a weed control programme that they deemed satisfactory as long as no new problem appeared, and until they could learn about more effective technical solutions. When designing a DSS that will ensure successful, more sustainable weed management practices, it is crucial to take account of both the complexity of the decision-making process and',\n",
" '0-20 cm) during October-November, before planting. The rotations significantly influenced mean SOM level, the order being fallow (lowest), continuous wheat, lentil, chickpea, vetch, and medic (highest). The mean effect of N was to increase SOM, but grazing intensity tended to decrease SOM. While results from different aspects of the trial published elsewhere demonstrated the value of legume-based rotations as biologically and economically viable alternatives to fallow or continuous cropping, this soil sampling SOM study showed that crop production can be compatible with the goal of improving soil quality, with potential environmental benefits. Thus, soil and crop management practices involving appropriate rotations (legumes/cereals), adequate N fertilization of the cereal crop, and retention of crop residues can combine sustainable and economic cropping while reversing soil degradation.']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"closest_chunks"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"openai.api_key = config[\"OPENAI_API_KEY\"]\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"response = openai.ChatCompletion.create(\n",
" model=\"gpt-4\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"the provided list represents the text chunks that are closest to the centroids; the centroids are the representations of different concepts from the field of agriculture, specifically, crom rotation. For every text chunk, determine the concept that the centroid represents and name it\\n\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"The chunks for which the concepts are to be determined are:\\n\" + '\\n'.join(closest_chunks) + \"\\n\"\n",
" }\n",
" ],\n",
" temperature=0,\n",
" max_tokens=256,\n",
" top_p=1,\n",
" frequency_penalty=0,\n",
" presence_penalty=0\n",
")"
]
},
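{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `openai.ChatCompletion.create` is the pre-1.0 `openai` Python API. With `openai>=1.0` the equivalent call (a sketch using the same `messages` list as above, not run here) would be:\n",
"\n",
"```python\n",
"from openai import OpenAI\n",
"client = OpenAI(api_key=config['OPENAI_API_KEY'])\n",
"response = client.chat.completions.create(model='gpt-4', messages=messages, temperature=0)\n",
"print(response.choices[0].message.content)\n",
"```"
]
},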
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. Weed Management in Crop Rotation\n",
"2. Soil Organic Carbon (SOC) Management\n",
"3. Weed Control Strategy in Crop Rotation\n",
"4. Soil Organic Matter (SOM) in Crop Rotation\n"
]
}
],
"source": [
"print(response['choices'][0]['message']['content'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# References\n",
"\n",
"1. [Data (.bib files)](https://github.com/nikogamulin/data/blob/main/bibliography.tar.gz)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "autogen",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}