Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Last active January 18, 2024 09:58
Show Gist options
  • Save ZanSara/ba7efd241c61ccfd12ed48195e23bb34 to your computer and use it in GitHub Desktop.
Save ZanSara/ba7efd241c61ccfd12ed48195e23bb34 to your computer and use it in GitHub Desktop.
Haystack 2.0 - Indexing data for RAG applications
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"da9b359b99bd4a55aa5de6909001ace6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9ae5621b2ec14e42b86ea022fc9a9036",
"IPY_MODEL_9c0c6de69c4e4cf0bb80069f8ccbbc17",
"IPY_MODEL_39b1de62c96e40359ce346af3d503345"
],
"layout": "IPY_MODEL_9199fa051abe4e47853507620b4c6ad8"
}
},
"9ae5621b2ec14e42b86ea022fc9a9036": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_37b920c754ca45fa8bb6f7deffda6b35",
"placeholder": "​",
"style": "IPY_MODEL_ea6f38b17f8448c4b1acbcf3f13cfac4",
"value": "Ranking by BM25...: 100%"
}
},
"9c0c6de69c4e4cf0bb80069f8ccbbc17": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6f18c152c08b48d48c60777cf6887c74",
"max": 137,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_b4bde5d5bbd3467793ca389192b9bb20",
"value": 137
}
},
"39b1de62c96e40359ce346af3d503345": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2c3630a3c26c4f7a912a4062ae2914b1",
"placeholder": "​",
"style": "IPY_MODEL_c2554f609d5a4228bcfe2cc795969ee5",
"value": " 137/137 [00:00<00:00, 4335.90 docs/s]"
}
},
"9199fa051abe4e47853507620b4c6ad8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"37b920c754ca45fa8bb6f7deffda6b35": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ea6f38b17f8448c4b1acbcf3f13cfac4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6f18c152c08b48d48c60777cf6887c74": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b4bde5d5bbd3467793ca389192b9bb20": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"2c3630a3c26c4f7a912a4062ae2914b1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c2554f609d5a4228bcfe2cc795969ee5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"*From https://www.zansara.dev/posts/2023-11-05-haystack-series-minimal-indexing*"
],
"metadata": {
"id": "BP1v_BapAGrJ"
}
},
{
"cell_type": "markdown",
"source": [
"# Install the libraries"
],
"metadata": {
"id": "n1DbBwkqX-ax"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "gixkExYMh9cA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "24f6b5c2-dbc0-495f-d444-ee6361673d8c"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting openai\n",
" Downloading openai-1.8.0-py3-none-any.whl (222 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.3/222.3 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n",
"Collecting httpx<1,>=0.23.0 (from openai)\n",
" Downloading httpx-0.26.0-py3-none-any.whl (75 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (1.10.13)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.0)\n",
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.1)\n",
"Collecting typing-extensions<5,>=4.7 (from openai)\n",
" Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.6)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.0)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2023.11.17)\n",
"Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n",
" Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n",
" Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: typing-extensions, h11, httpcore, httpx, openai\n",
" Attempting uninstall: typing-extensions\n",
" Found existing installation: typing_extensions 4.5.0\n",
" Uninstalling typing_extensions-4.5.0:\n",
" Successfully uninstalled typing_extensions-4.5.0\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"llmx 0.0.15a0 requires cohere, which is not installed.\n",
"llmx 0.0.15a0 requires tiktoken, which is not installed.\n",
"tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed h11-0.14.0 httpcore-1.0.2 httpx-0.26.0 openai-1.8.0 typing-extensions-4.9.0\n",
"Collecting boilerpy3\n",
" Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)\n",
"Installing collected packages: boilerpy3\n",
"Successfully installed boilerpy3-1.0.7\n",
"Collecting haystack-ai\n",
" Downloading haystack_ai-2.0.0b5-py3-none-any.whl (233 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.5/233.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: boilerpy3 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.0.7)\n",
"Collecting haystack-bm25 (from haystack-ai)\n",
" Downloading haystack_bm25-1.0.2-py2.py3-none-any.whl (8.8 kB)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.3)\n",
"Collecting lazy-imports (from haystack-ai)\n",
" Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.1.0)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.2.1)\n",
"Requirement already satisfied: openai>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.8.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.5.3)\n",
"Collecting posthog (from haystack-ai)\n",
" Downloading posthog-3.3.1-py2.py3-none-any.whl (40 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.6/40.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.1)\n",
"Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (8.2.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.9.0)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (3.7.1)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai) (1.7.0)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (0.26.0)\n",
"Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.10.13)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.3.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from haystack-bm25->haystack-ai) (1.23.5)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2023.3.post1)\n",
"Requirement already satisfied: requests<3.0,>=2.7 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.31.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.16.0)\n",
"Collecting monotonic>=1.5 (from posthog->haystack-ai)\n",
" Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
"Collecting backoff>=1.10.0 (from posthog->haystack-ai)\n",
" Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (3.6)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (1.2.0)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (2023.11.17)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (1.0.2)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (0.14.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (3.3.2)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (2.0.7)\n",
"Installing collected packages: monotonic, lazy-imports, haystack-bm25, backoff, posthog, haystack-ai\n",
"Successfully installed backoff-2.2.1 haystack-ai-2.0.0b5 haystack-bm25-1.0.2 lazy-imports-0.3.1 monotonic-1.6 posthog-3.3.1\n"
]
}
],
"source": [
"# Install Haystack and its dependencies\n",
"%pip install openai\n",
"%pip install boilerpy3\n",
"%pip install haystack-ai==2.0.0b5"
]
},
{
"cell_type": "code",
"source": [
"# Get OpenAI API key\n",
"\n",
"import getpass\n",
"\n",
"api_key = getpass.getpass()"
],
"metadata": {
"id": "wzdNlKb2To-z",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3b209a7a-1e19-4c96-8635-bbcadbb07ffa"
},
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"··········\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Convert HTML files\n",
"\n"
],
"metadata": {
"id": "2Se0WERxZuc6"
}
},
{
"cell_type": "code",
"source": [
"# Download a few web pages about Rose Island\n",
"!curl -o Republic_of_Rose_Island-Wikipedia.html https://en.wikipedia.org/wiki/Republic_of_Rose_Island\n",
"!curl -o Isola_delle_Rose-Wikipedia.html https://it.wikipedia.org/wiki/Isola_delle_Rose\n",
"!curl -o Rose_Island_BBC.html https://www.bbc.com/news/entertainment-arts-55092341\n",
"!curl -o Stamping_a_Nations_identity_Rose_island.html https://www.stanleygibbons.com/collecting-stamps/dispatches/stamping-nations-identity-rose-island\n",
"!curl -o La_Vera_Storia_dell_Isola_Delle_Rose.html https://www.ilpost.it/2020/12/09/isola-delle-rose-storia-vera/\n",
"!curl -o Il_Mistero_Dell_Isola_Delle_Rose.html https://web.archive.org/web/20200527140504/https://riminisparita.it/mistero-isola-delle-rose-analisi-intervista-rosa-delehaye/\n",
"\n",
"file_names = [\n",
" \"Republic_of_Rose_Island-Wikipedia.html\",\n",
" \"Isola_delle_Rose-Wikipedia.html\",\n",
" \"Rose_Island_BBC.html\",\n",
" \"Stamping_a_Nations_identity_Rose_island.html\",\n",
" \"La_Vera_Storia_dell_Isola_Delle_Rose.html\",\n",
" \"Il_Mistero_Dell_Isola_Delle_Rose.html\",\n",
"]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5QXu-2XFEKAH",
"outputId": "a4d3bab9-a08d-4b22-e7c5-eac3a1e2cd34"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 120k 100 120k 0 0 1238k 0 --:--:-- --:--:-- --:--:-- 1247k\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 154k 100 154k 0 0 1586k 0 --:--:-- --:--:-- --:--:-- 1591k\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 249k 100 249k 0 0 448k 0 --:--:-- --:--:-- --:--:-- 448k\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 120k 0 120k 0 0 92014 0 --:--:-- 0:00:01 --:--:-- 92027\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 69001 100 69001 0 0 195k 0 --:--:-- --:--:-- --:--:-- 195k\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 221k 100 221k 0 0 229k 0 --:--:-- --:--:-- --:--:-- 229k\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.components.converters import HTMLToDocument\n",
"\n",
"converter = HTMLToDocument()\n",
"documents = converter.run(sources=file_names)[\"documents\"]\n",
"\n",
"print(documents[0].content)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aJqjTxrhZyn-",
"outputId": "5d31f501-47bf-4d87-e0ec-a3c7689d007b"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Republic of Rose Island\n",
"1968 short-lived micronation on a man-made platform in the Adriatic Sea\n",
"Republic of Rose Island\n",
"Repubblica dell'Isola delle Rose ( Italian )\n",
"Status\n",
"Location\n",
"Purported currency\n",
"milo (used in stamps; no coins or notes are known to have been printed)\n",
"Formation\n",
"Republic of Rose Island\n",
"The Republic of Rose Island ( Esperanto : Respubliko de la Insulo de la Rozoj; Italian : Repubblica dell'Isola delle Rose) was a short-lived micronation on a man-made platform in the Adriatic Sea , 11 kilometres (6.8 mi) off the coast of the province of Rimini , Italy, built by Italian engineer Giorgio Rosa, who made himself its president and declared it an independent state on 1 May 1968. [1] [2] Rose Island had its own government, currency, post office, and commercial establishments, and the official language was Esperanto . [1]\n",
"However, it was never formally recognized as a sovereign state by any country of the world. The Italian government viewed it as a ploy by Rosa to raise money from tourists while avoiding national taxation, Rose Island was occupied by the Italian police forces on 26 June 1968, subject to naval blockade, and eventually demolished in February 1969. [3] [4]\n",
"It is believed that the Esperanto term Rozoj (in Italian: rose) was borrowed from the surname of Giorgio Rosa, the designer and builder of the artificial platform, as well as the creator and inspirer of the state entity, as well as from his desire to \"see roses bloom on the sea\". [5]\n",
"In 1958, Italian engineer Giorgio Rosa funded the construction of a 400-square-metre (4,300 sq ft) platform supported by nine pylons and furnished it with a number of commercial establishments, including a restaurant, bar, nightclub, souvenir shop, and post office, with construction being completed in 1967. [6]\n",
"The platform declared independence on 1 May 1968, under the Esperanto name Insulo de la Rozoj, with Rosa as self-declared president. Rose Island issued a number of stamps, including one showing its approximate location in the Adriatic Sea . The purported currency of the republic was the mill , and this appeared on early stamp issues, although no coins or banknotes are known to have been produced. [7]\n",
"Rosa's actions were viewed by the Italian government as a ploy to raise money from tourists while avoiding national taxation. Whether or not this was the real reason behind Rosa's micronation, the Italian government's response was swift: On 26 June 1968, 55 days after the island declared independence, the Italian navy sent a group of four carabinieri and Guardia di Finanza officers, who assumed control, cleared the island, and set up a blockade so no one could re-enter. [7] [8]\n",
"At first, the Italian government tried to dismantle the island, but they found it impossible, so they decided to blow it up instead. The Italian Navy bombed the island twice, with the first time failing, and the second bombing taking place on 13 February 1969, [9] but the island still stood. Afterward, Rosa's self-declared government in exile created stamps depicting the events. Rosa was billed by the Italian government for war costs. Finally, on 26 February 1969, the island was toppled by a storm. Only one death was counted but never confirmed: apparently, Rosa's dog was on the platform during the facility's detonation. [8]\n",
"Since the first decade of the 2000s, Rose Island's history has been the subject of documentary research and rediscoveries, based on the utopian aspect of its genesis. [11]\n",
"In popular culture[ edit ]\n",
"Rose Island is featured in the Italian comic book Martin Mystère , n. 193. [12]\n",
"Rose Island , a 2020 film based on the story of the micronation, directed by Sydney Sibilia , was released on Netflix on 8 December 2020. [13]\n",
"REM Island , a platform towed into international waters for the purposes of offshore radio broadcasting.\n",
"Sealand , a declared principality near the United Kingdom, built on a World War II sea fort.\n",
"Vaccarezza, Fabio (January 2007). \"Rose Island: A Dream of Freedom\". The Cinderella Philatelist: 42–46. ISSN 0009-6911 .\n",
"Strauss, Erwin S. (1984). How to Start Your Own Country (2nd ed.). Port Townsend, WA: Breakout Productions. pp. 129–130. ISBN\n",
"Menefee, Samuel Pyeatt (Fall 1994). \"'Republics of the Reefs': Nation-Building on the Continental Shelf and in the World's Oceans\". California Western International Law Journal. 25 (1): 105–06. ISSN 0886-3210 .\n",
"Italian-language website - discusses the history of Rose Island and its postage stamps. Includes a scan of part of a contemporary newspaper article.[ dead link ]\n",
"(including pictures of its destruction) and comments from the daughter of one of the people responsible for the destruction (Italian language)\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Clean the text"
],
"metadata": {
"id": "8ZfJBk9qibQ8"
}
},
{
"cell_type": "code",
"source": [
"from haystack.components.preprocessors import DocumentCleaner\n",
"\n",
"cleaner = DocumentCleaner()\n",
"clean_documents = cleaner.run(documents)[\"documents\"]\n",
"clean_documents[0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gZhoqTVWicuj",
"outputId": "52511b7e-5fe8-457a-b536-4d8529243b71"
},
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Document(id=0850aa1bbbbca32db2dec790a12ee90153c3575306c6005ccd3fc9c9434c45ba, content: 'Republic of Rose Island\n",
"1968 short-lived micronation on a man-made platform in the Adriatic Sea\n",
"Repu...', meta: {'file_path': 'Republic_of_Rose_Island-Wikipedia.html'})"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"source": [
"# Split the text"
],
"metadata": {
"id": "2rTg54MDafWW"
}
},
{
"cell_type": "code",
"source": [
"from haystack.components.preprocessors import DocumentSplitter\n",
"\n",
"text_splitter = DocumentSplitter(split_by=\"sentence\", split_length=5)\n",
"split_documents = text_splitter.run(documents=clean_documents)[\"documents\"]\n",
"split_documents[0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GdMqf5M8akzc",
"outputId": "3b9b4907-f25c-4c8d-e992-c486443eeeda"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Document(id=4bf81c9af7580f1afbafc933919b2611e2595deaa1dddbce1ecc59273b97d5e1, content: 'Republic of Rose Island\n",
"1968 short-lived micronation on a man-made platform in the Adriatic Sea\n",
"Repu...', meta: {'file_path': 'Republic_of_Rose_Island-Wikipedia.html', 'source_id': '0850aa1bbbbca32db2dec790a12ee90153c3575306c6005ccd3fc9c9434c45ba'})"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"len(split_documents)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "q4mUWImckAPJ",
"outputId": "a84d641d-9451-4223-9c4d-cdb3eb5141aa"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"137"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "markdown",
"source": [
"# Write the documents to the store"
],
"metadata": {
"id": "hNt9sZCSllrb"
}
},
{
"cell_type": "code",
"source": [
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
"\n",
"document_store = InMemoryDocumentStore()"
],
"metadata": {
"id": "JHAye51XlyBk"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from haystack.components.writers import DocumentWriter\n",
"\n",
"writer = DocumentWriter(document_store=document_store)\n",
"writer.run(documents=split_documents)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5sD_y13YloDW",
"outputId": "54eb075a-81c4-4a82-f97c-bf1eb69f8c56"
},
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'documents_written': 137}"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"document_store.filter_documents()[0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6udDEyDsmB4L",
"outputId": "9fa68b50-f083-451c-bc4e-632a3d4246f1"
},
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Document(id=4bf81c9af7580f1afbafc933919b2611e2595deaa1dddbce1ecc59273b97d5e1, content: 'Republic of Rose Island\n",
"1968 short-lived micronation on a man-made platform in the Adriatic Sea\n",
"Repu...', meta: {'file_path': 'Republic_of_Rose_Island-Wikipedia.html', 'source_id': '0850aa1bbbbca32db2dec790a12ee90153c3575306c6005ccd3fc9c9434c45ba'})"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "markdown",
"source": [
"# The indexing pipeline"
],
"metadata": {
"id": "HkomqVBVmPSO"
}
},
{
"cell_type": "code",
"source": [
"from haystack import Pipeline\n",
"\n",
"document_store = InMemoryDocumentStore()\n",
"\n",
"pipeline = Pipeline()\n",
"pipeline.add_component(\"converter\", HTMLToDocument())\n",
"pipeline.add_component(\"cleaner\", DocumentCleaner())\n",
"pipeline.add_component(\"splitter\", DocumentSplitter(split_by=\"sentence\", split_length=5))\n",
"pipeline.add_component(\"writer\", DocumentWriter(document_store=document_store))\n",
"pipeline.connect(\"converter\", \"cleaner\")\n",
"pipeline.connect(\"cleaner\", \"splitter\")\n",
"pipeline.connect(\"splitter\", \"writer\")\n",
"\n",
"pipeline.draw(\"minimal-indexing-pipeline.png\")\n",
"\n",
"pipeline.run({\"converter\": {\"sources\": file_names}})\n",
"\n",
"document_store.filter_documents()[0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lGlAB98KmRV5",
"outputId": "5dcbe3c8-c55f-49a2-9cab-2f35d39781c8"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Document(id=4bf81c9af7580f1afbafc933919b2611e2595deaa1dddbce1ecc59273b97d5e1, content: 'Republic of Rose Island\n",
"1968 short-lived micronation on a man-made platform in the Adriatic Sea\n",
"Repu...', meta: {'file_path': 'Republic_of_Rose_Island-Wikipedia.html', 'source_id': '0850aa1bbbbca32db2dec790a12ee90153c3575306c6005ccd3fc9c9434c45ba'})"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"source": [
"# Let's use it for RAG!"
],
"metadata": {
"id": "a3AQE_Iffbuq"
}
},
{
"cell_type": "code",
"source": [
"from haystack.components.generators import OpenAIGenerator\n",
"from haystack.components.builders.prompt_builder import PromptBuilder\n",
"from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n",
"\n",
"template = \"\"\"\n",
"Given the following information, answer the question: {{ question }}\n",
"\n",
"{% for document in documents %}\n",
" {{ document.content }}\n",
"{% endfor %}\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"\n",
"pipe.add_component(\"retriever\", InMemoryBM25Retriever(document_store=document_store))\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", OpenAIGenerator(api_key=api_key))\n",
"pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"question = \"Is there any documentary about the story of Rose Island? Can you tell me something about that?\"\n",
"pipe.run({\n",
" \"retriever\": {\"query\": question},\n",
" \"prompt_builder\": {\"question\": question}\n",
"})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 240,
"referenced_widgets": [
"da9b359b99bd4a55aa5de6909001ace6",
"9ae5621b2ec14e42b86ea022fc9a9036",
"9c0c6de69c4e4cf0bb80069f8ccbbc17",
"39b1de62c96e40359ce346af3d503345",
"9199fa051abe4e47853507620b4c6ad8",
"37b920c754ca45fa8bb6f7deffda6b35",
"ea6f38b17f8448c4b1acbcf3f13cfac4",
"6f18c152c08b48d48c60777cf6887c74",
"b4bde5d5bbd3467793ca389192b9bb20",
"2c3630a3c26c4f7a912a4062ae2914b1",
"c2554f609d5a4228bcfe2cc795969ee5"
]
},
"id": "iy6ZXhBSfeSG",
"outputId": "d9bbc258-ff0d-48d5-aa31-f8b6daaf8538"
},
"execution_count": 16,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/137 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "da9b359b99bd4a55aa5de6909001ace6"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['Yes, there is a documentary about the story of Rose Island. The documentary is called \"Rose Island\" and it has been made into a film by Netflix. It follows the true story of Giorgio Rosa and his battle with the Italian authorities for his self-made structure to be recognized as an independent state. Giorgio Rosa built his own island in the Adriatic sea in the late 1960s, which housed various facilities. The Italian government tried to shut it down and discredit it, but it only made the island more popular. Giorgio Rosa\\'s son shares that his father was upset and suffered when the island was destroyed. Despite the challenges, the film was able to be made with the support of Netflix, allowing the story to be shared with a wider audience.'],\n",
" 'meta': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'completion_tokens': 157,\n",
" 'prompt_tokens': 1544,\n",
" 'total_tokens': 1701}}]}}"
]
},
"metadata": {},
"execution_count": 16
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment