ZanSara/RAG_Pipelines.ipynb

## RAG_Pipelines.ipynb
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gixkExYMh9cA",
        "outputId": "4f65c6ce-272d-44d6-8ca4-739fde32a9c0"
      },
      "outputs": [],
      "source": [
        "# Install haystack & some deps\n",
        "# One %pip command per package; none of them pins a version.\n",
        "# NOTE(review): the cells below import from `haystack.preview`, so this\n",
        "# presumably needs a haystack-ai release that still ships that namespace\n",
        "# - TODO confirm and pin.\n",
        "%pip install posthog\n",
        "%pip install langdetect\n",
        "%pip install boilerpy3\n",
        "%pip install haystack-ai"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wzdNlKb2To-z"
      },
      "outputs": [],
      "source": [
        "# Get OpenAI API key\n",
        "#\n",
        "# The key is typed in interactively, so it does not have to be written\n",
        "# in the notebook source. An explicit prompt replaces getpass's default\n",
        "# \"Password: \" label, which does not say which secret is expected.\n",
        "\n",
        "import getpass\n",
        "\n",
        "api_key = getpass.getpass(\"OpenAI API key: \")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "Ubo3JEG-UnNf"
      },
      "source": [
        "## Direct query of an LLM"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N-4JJYYai-i9"
      },
      "outputs": [],
      "source": [
        "# Build the OpenAI-backed generator and ask it a question directly\n",
        "from haystack.preview.components.generators.openai.gpt import GPTGenerator\n",
        "\n",
        "# `generator` is the instance the pipelines in later cells add as \"llm\".\n",
        "generator = GPTGenerator(api_key=api_key)\n",
        "\n",
        "# Call the component on its own, outside of any pipeline.\n",
        "result = generator.run(\n",
        "    prompt=\"What's the capital of France?\",\n",
        ")\n",
        "\n",
        "# Display just the \"replies\" entry of the returned dict.\n",
        "result[\"replies\"]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2u_50rAUjpod"
      },
      "outputs": [],
      "source": [
        "# Turn a template with a `country` placeholder into a prompt\n",
        "from haystack.preview.components.builders.prompt_builder import PromptBuilder\n",
        "\n",
        "# Template text with a single {{ country }} variable.\n",
        "prompt_template = \"What's the population of {{ country }}?\"\n",
        "prompt_builder = PromptBuilder(template=prompt_template)\n",
        "\n",
        "# Run the builder by itself to inspect what it produces.\n",
        "prompt_builder.run(\n",
        "    country=\"France\",\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "14qTTokskoWJ"
      },
      "outputs": [],
      "source": [
        "# Connect PromptBuilder and LLM\n",
        "\n",
        "from haystack.preview import Pipeline\n",
        "\n",
        "# Register both components under the names used to address them below.\n",
        "pipe = Pipeline()\n",
        "pipe.add_component(\"prompt_builder\", prompt_builder)\n",
        "pipe.add_component(\"llm\", generator)\n",
        "# Name the output and input sockets explicitly, as the other pipelines\n",
        "# in this notebook do, instead of relying on implicit matching.\n",
        "pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
        "\n",
        "# Inputs are keyed by component name; here only the template variable is passed.\n",
        "country = \"Germany\"\n",
        "results = pipe.run({\"prompt_builder\": {\"country\": country}})\n",
        "\n",
        "# Display just the replies stored under the \"llm\" component.\n",
        "results[\"llm\"][\"replies\"]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NWDO5EUgVGhN"
      },
      "outputs": [],
      "source": [
        "# Draw the prompt_builder -> llm pipeline held in `pipe`.\n",
        "# NOTE(review): presumably this writes the diagram to the given PNG path - confirm.\n",
        "pipe.draw(\"simple-llm-pipeline.png\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "3V5sCS1bUqzS"
      },
      "source": [
        "## \"Ground\" the LLM replies"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zF1hosmwFxzG"
      },
      "outputs": [],
      "source": [
        "# Store two tiny documents in an in-memory document store\n",
        "from haystack.preview.dataclasses import Document\n",
        "from haystack.preview.document_stores import InMemoryDocumentStore\n",
        "\n",
        "# One single-sentence document per country.\n",
        "documents = [\n",
        "    Document(text=\"The population of Germany is 100 million people.\"),\n",
        "    Document(text=\"About 65 million people live in France as of today.\"),\n",
        "]\n",
        "\n",
        "docstore = InMemoryDocumentStore()\n",
        "docstore.write_documents(documents=documents)\n",
        "\n",
        "# Called without filters; presumably returns every stored document.\n",
        "docstore.filter_documents()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KvZnsKRyNH9j"
      },
      "outputs": [],
      "source": [
        "# Put a document's text into the prompt template\n",
        "\n",
        "# The template now takes two variables: `document` and `country`.\n",
        "prompt_template = \"\"\"\n",
        "  According to this document:\n",
        "\n",
        "  \"{{ document.text }}\"\n",
        "\n",
        "  What's the population of {{ country }}?\n",
        "\"\"\"\n",
        "prompt_builder = PromptBuilder(template=prompt_template)\n",
        "\n",
        "# Run the builder with the first document to inspect the result.\n",
        "prompt_builder.run(\n",
        "    document=documents[0],\n",
        "    country=\"Germany\",\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hhzctATfNH1H"
      },
      "outputs": [],
      "source": [
        "# Run the pipeline, handing it both the country name and a document\n",
        "from haystack.preview import Pipeline\n",
        "\n",
        "# Same two-component layout as before, now with the document-aware builder.\n",
        "pipe = Pipeline()\n",
        "pipe.add_component(\"prompt_builder\", prompt_builder)\n",
        "pipe.add_component(\"llm\", generator)\n",
        "pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
        "\n",
        "# NOTE(review): the question is about France while documents[0] is the\n",
        "# Germany sentence - presumably deliberate, to show what happens with a\n",
        "# mismatched document; confirm.\n",
        "country = \"France\"\n",
        "results = pipe.run(\n",
        "    {\n",
        "        \"prompt_builder\": {\n",
        "            \"document\": documents[0],\n",
        "            \"country\": country,\n",
        "        }\n",
        "    }\n",
        ")\n",
        "\n",
        "results[\"llm\"][\"replies\"]"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "58LSha5lVeue"
      },
      "source": [
        "## Retrieve the correct Documents for the PromptBuilder"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fI8Ml68KOYwn"
      },
      "outputs": [],
      "source": [
        "# Add a BM25Retriever\n",
        "\n",
        "from haystack.preview.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n",
        "\n",
        "# The template loops over a `documents` list instead of taking one document.\n",
        "prompt_template = \"\"\"\n",
        "  According to these documents:\n",
        "\n",
        "  {% for doc in documents %}\n",
        "  \"{{ doc.text }}\"\n",
        "  {% endfor %}\n",
        "\n",
        "  What's the population of {{ country }}?\n",
        "\"\"\"\n",
        "prompt_builder = PromptBuilder(template=prompt_template)\n",
        "retriever = InMemoryBM25Retriever(document_store=docstore)\n",
        "\n",
        "# retriever -> prompt_builder -> llm\n",
        "pipe = Pipeline()\n",
        "pipe.add_component(\"retriever\", retriever)\n",
        "pipe.add_component(\"prompt_builder\", prompt_builder)\n",
        "pipe.add_component(\"llm\", generator)\n",
        "pipe.connect(\"retriever.documents\", \"prompt_builder.documents\")\n",
        "pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
        "\n",
        "# The country name is used both as the retrieval query and as the\n",
        "# template variable.\n",
        "country = \"Germany\"\n",
        "results = pipe.run(\n",
        "    {\n",
        "        \"retriever\": {\"query\": country},\n",
        "        \"prompt_builder\": {\"country\": country},\n",
        "    }\n",
        ")\n",
        "\n",
        "results[\"llm\"][\"replies\"]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "v4mLl5OoWHpZ"
      },
      "outputs": [],
      "source": [
        "# Draw the retriever -> prompt_builder -> llm pipeline held in `pipe`.\n",
        "# NOTE(review): presumably this writes the diagram to the given PNG path - confirm.\n",
        "pipe.draw(\"rag-pipeline.png\")"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "3m5NMSoIWREQ"
      },
      "source": [
        "## Retrieve from the Web"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qgPbUyhaPpNw"
      },
      "outputs": [],
      "source": [
        "# Chain the web retrieval components on their own, without an LLM\n",
        "from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
        "from haystack.preview.components.file_converters.html import HTMLToDocument\n",
        "from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
        "\n",
        "fetcher = LinkContentFetcher()\n",
        "converter = HTMLToDocument()\n",
        "text_splitter = TextDocumentSplitter(\n",
        "    split_by=\"sentence\",\n",
        "    split_length=10,\n",
        ")\n",
        "\n",
        "# fetcher -> converter -> text_splitter\n",
        "pipe = Pipeline()\n",
        "pipe.add_component(\"fetcher\", fetcher)\n",
        "pipe.add_component(\"converter\", converter)\n",
        "pipe.add_component(\"text_splitter\", text_splitter)\n",
        "pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
        "pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
        "\n",
        "pipe.draw(\"web-retriever.png\")\n",
        "\n",
        "# Only the fetcher needs an input: the list of URLs to download.\n",
        "pipe.run(\n",
        "    {\"fetcher\": {\"urls\": [\"https://en.wikipedia.org/wiki/Germany\"]}}\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9zgOE0vCWbu7"
      },
      "outputs": [],
      "source": [
        "# Put them all together\n",
        "from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
        "from haystack.preview.components.file_converters.html import HTMLToDocument\n",
        "from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
        "\n",
        "# Fresh component instances for the new pipeline.\n",
        "fetcher = LinkContentFetcher()\n",
        "converter = HTMLToDocument()\n",
        "text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n",
        "\n",
        "\n",
        "# Only the first 10 entries of `documents` are rendered into the prompt\n",
        "# (presumably to bound the prompt size).\n",
        "prompt_template = \"\"\"\n",
        "  According to these documents:\n",
        "\n",
        "  {% for doc in documents[:10] %}\n",
        "  \"{{ doc.text }}\"\n",
        "  {% endfor %}\n",
        "\n",
        "  What's the population of {{ country }}?\n",
        "\"\"\"\n",
        "prompt_builder = PromptBuilder(template=prompt_template)\n",
        "\n",
        "\n",
        "# fetcher -> converter -> text_splitter -> prompt_builder -> llm\n",
        "pipe = Pipeline()\n",
        "pipe.add_component(\"fetcher\", fetcher)\n",
        "pipe.add_component(\"converter\", converter)\n",
        "pipe.add_component(\"text_splitter\", text_splitter)\n",
        "pipe.add_component(\"prompt_builder\", prompt_builder)\n",
        "pipe.add_component(\"llm\", generator)\n",
        "pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
        "pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
        "pipe.connect(\"text_splitter.documents\", \"prompt_builder.documents\")\n",
        "pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
        "\n",
        "pipe.draw(\"web-retrieval-rag.png\")\n",
        "\n",
        "# The country name fills the template and also selects the Wikipedia page to fetch.\n",
        "country = \"Spain\"\n",
        "pipe.run({\"prompt_builder\": {\"country\": country}, \"fetcher\": {\"urls\": [f\"https://en.wikipedia.org/wiki/{country}\"]}})\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "gixkExYMh9cA",
	"outputId": "4f65c6ce-272d-44d6-8ca4-739fde32a9c0"
	},
	"outputs": [],
	"source": [
	"# Install haystack & some deps\n",
	"# One %pip command per package; none of them pins a version.\n",
	"# NOTE(review): the cells below import from `haystack.preview`, so this\n",
	"# presumably needs a haystack-ai release that still ships that namespace\n",
	"# - TODO confirm and pin.\n",
	"%pip install posthog\n",
	"%pip install langdetect\n",
	"%pip install boilerpy3\n",
	"%pip install haystack-ai"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "wzdNlKb2To-z"
	},
	"outputs": [],
	"source": [
	"# Get OpenAI API key\n",
	"#\n",
	"# The key is typed in interactively, so it does not have to be written\n",
	"# in the notebook source. An explicit prompt replaces getpass's default\n",
	"# \"Password: \" label, which does not say which secret is expected.\n",
	"\n",
	"import getpass\n",
	"\n",
	"api_key = getpass.getpass(\"OpenAI API key: \")"
	]
	},
	{
	"attachments": {},
	"cell_type": "markdown",
	"metadata": {
	"id": "Ubo3JEG-UnNf"
	},
	"source": [
	"## Direct query of an LLM"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "N-4JJYYai-i9"
	},
	"outputs": [],
	"source": [
	"# Build the OpenAI-backed generator and ask it a question directly\n",
	"from haystack.preview.components.generators.openai.gpt import GPTGenerator\n",
	"\n",
	"# `generator` is the instance the pipelines in later cells add as \"llm\".\n",
	"generator = GPTGenerator(api_key=api_key)\n",
	"\n",
	"# Call the component on its own, outside of any pipeline.\n",
	"result = generator.run(\n",
	"    prompt=\"What's the capital of France?\",\n",
	")\n",
	"\n",
	"# Display just the \"replies\" entry of the returned dict.\n",
	"result[\"replies\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "2u_50rAUjpod"
	},
	"outputs": [],
	"source": [
	"# Turn a template with a `country` placeholder into a prompt\n",
	"from haystack.preview.components.builders.prompt_builder import PromptBuilder\n",
	"\n",
	"# Template text with a single {{ country }} variable.\n",
	"prompt_template = \"What's the population of {{ country }}?\"\n",
	"prompt_builder = PromptBuilder(template=prompt_template)\n",
	"\n",
	"# Run the builder by itself to inspect what it produces.\n",
	"prompt_builder.run(\n",
	"    country=\"France\",\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "14qTTokskoWJ"
	},
	"outputs": [],
	"source": [
	"# Connect PromptBuilder and LLM\n",
	"\n",
	"from haystack.preview import Pipeline\n",
	"\n",
	"# Register both components under the names used to address them below.\n",
	"pipe = Pipeline()\n",
	"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
	"pipe.add_component(\"llm\", generator)\n",
	"# Name the output and input sockets explicitly, as the other pipelines\n",
	"# in this notebook do, instead of relying on implicit matching.\n",
	"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
	"\n",
	"# Inputs are keyed by component name; here only the template variable is passed.\n",
	"country = \"Germany\"\n",
	"results = pipe.run({\"prompt_builder\": {\"country\": country}})\n",
	"\n",
	"# Display just the replies stored under the \"llm\" component.\n",
	"results[\"llm\"][\"replies\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "NWDO5EUgVGhN"
	},
	"outputs": [],
	"source": [
	"# Draw the prompt_builder -> llm pipeline held in `pipe`.\n",
	"# NOTE(review): presumably this writes the diagram to the given PNG path - confirm.\n",
	"pipe.draw(\"simple-llm-pipeline.png\")"
	]
	},
	{
	"attachments": {},
	"cell_type": "markdown",
	"metadata": {
	"id": "3V5sCS1bUqzS"
	},
	"source": [
	"## \"Ground\" the LLM replies"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "zF1hosmwFxzG"
	},
	"outputs": [],
	"source": [
	"# Store two tiny documents in an in-memory document store\n",
	"from haystack.preview.dataclasses import Document\n",
	"from haystack.preview.document_stores import InMemoryDocumentStore\n",
	"\n",
	"# One single-sentence document per country.\n",
	"documents = [\n",
	"    Document(text=\"The population of Germany is 100 million people.\"),\n",
	"    Document(text=\"About 65 million people live in France as of today.\"),\n",
	"]\n",
	"\n",
	"docstore = InMemoryDocumentStore()\n",
	"docstore.write_documents(documents=documents)\n",
	"\n",
	"# Called without filters; presumably returns every stored document.\n",
	"docstore.filter_documents()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "KvZnsKRyNH9j"
	},
	"outputs": [],
	"source": [
	"# Put a document's text into the prompt template\n",
	"\n",
	"# The template now takes two variables: `document` and `country`.\n",
	"prompt_template = \"\"\"\n",
	" According to this document:\n",
	"\n",
	" \"{{ document.text }}\"\n",
	"\n",
	" What's the population of {{ country }}?\n",
	"\"\"\"\n",
	"prompt_builder = PromptBuilder(template=prompt_template)\n",
	"\n",
	"# Run the builder with the first document to inspect the result.\n",
	"prompt_builder.run(\n",
	"    document=documents[0],\n",
	"    country=\"Germany\",\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "hhzctATfNH1H"
	},
	"outputs": [],
	"source": [
	"# Run the pipeline, handing it both the country name and a document\n",
	"from haystack.preview import Pipeline\n",
	"\n",
	"# Same two-component layout as before, now with the document-aware builder.\n",
	"pipe = Pipeline()\n",
	"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
	"pipe.add_component(\"llm\", generator)\n",
	"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
	"\n",
	"# NOTE(review): the question is about France while documents[0] is the\n",
	"# Germany sentence - presumably deliberate, to show what happens with a\n",
	"# mismatched document; confirm.\n",
	"country = \"France\"\n",
	"results = pipe.run(\n",
	"    {\n",
	"        \"prompt_builder\": {\n",
	"            \"document\": documents[0],\n",
	"            \"country\": country,\n",
	"        }\n",
	"    }\n",
	")\n",
	"\n",
	"results[\"llm\"][\"replies\"]"
	]
	},
	{
	"attachments": {},
	"cell_type": "markdown",
	"metadata": {
	"id": "58LSha5lVeue"
	},
	"source": [
	"## Retrieve the correct Documents for the PromptBuilder"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "fI8Ml68KOYwn"
	},
	"outputs": [],
	"source": [
	"# Add a BM25Retriever\n",
	"\n",
	"from haystack.preview.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n",
	"\n",
	"# The template loops over a `documents` list instead of taking one document.\n",
	"prompt_template = \"\"\"\n",
	" According to these documents:\n",
	"\n",
	" {% for doc in documents %}\n",
	" \"{{ doc.text }}\"\n",
	" {% endfor %}\n",
	"\n",
	" What's the population of {{ country }}?\n",
	"\"\"\"\n",
	"prompt_builder = PromptBuilder(template=prompt_template)\n",
	"retriever = InMemoryBM25Retriever(document_store=docstore)\n",
	"\n",
	"# retriever -> prompt_builder -> llm\n",
	"pipe = Pipeline()\n",
	"pipe.add_component(\"retriever\", retriever)\n",
	"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
	"pipe.add_component(\"llm\", generator)\n",
	"pipe.connect(\"retriever.documents\", \"prompt_builder.documents\")\n",
	"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
	"\n",
	"# The country name is used both as the retrieval query and as the\n",
	"# template variable.\n",
	"country = \"Germany\"\n",
	"results = pipe.run(\n",
	"    {\n",
	"        \"retriever\": {\"query\": country},\n",
	"        \"prompt_builder\": {\"country\": country},\n",
	"    }\n",
	")\n",
	"\n",
	"results[\"llm\"][\"replies\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "v4mLl5OoWHpZ"
	},
	"outputs": [],
	"source": [
	"# Draw the retriever -> prompt_builder -> llm pipeline held in `pipe`.\n",
	"# NOTE(review): presumably this writes the diagram to the given PNG path - confirm.\n",
	"pipe.draw(\"rag-pipeline.png\")"
	]
	},
	{
	"attachments": {},
	"cell_type": "markdown",
	"metadata": {
	"id": "3m5NMSoIWREQ"
	},
	"source": [
	"## Retrieve from the Web"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "qgPbUyhaPpNw"
	},
	"outputs": [],
	"source": [
	"# Chain the web retrieval components on their own, without an LLM\n",
	"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
	"from haystack.preview.components.file_converters.html import HTMLToDocument\n",
	"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
	"\n",
	"fetcher = LinkContentFetcher()\n",
	"converter = HTMLToDocument()\n",
	"text_splitter = TextDocumentSplitter(\n",
	"    split_by=\"sentence\",\n",
	"    split_length=10,\n",
	")\n",
	"\n",
	"# fetcher -> converter -> text_splitter\n",
	"pipe = Pipeline()\n",
	"pipe.add_component(\"fetcher\", fetcher)\n",
	"pipe.add_component(\"converter\", converter)\n",
	"pipe.add_component(\"text_splitter\", text_splitter)\n",
	"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
	"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
	"\n",
	"pipe.draw(\"web-retriever.png\")\n",
	"\n",
	"# Only the fetcher needs an input: the list of URLs to download.\n",
	"pipe.run(\n",
	"    {\"fetcher\": {\"urls\": [\"https://en.wikipedia.org/wiki/Germany\"]}}\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "9zgOE0vCWbu7"
	},
	"outputs": [],
	"source": [
	"# Put them all together\n",
	"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
	"from haystack.preview.components.file_converters.html import HTMLToDocument\n",
	"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
	"\n",
	"# Fresh component instances for the new pipeline.\n",
	"fetcher = LinkContentFetcher()\n",
	"converter = HTMLToDocument()\n",
	"text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n",
	"\n",
	"\n",
	"# Only the first 10 entries of `documents` are rendered into the prompt\n",
	"# (presumably to bound the prompt size).\n",
	"prompt_template = \"\"\"\n",
	" According to these documents:\n",
	"\n",
	" {% for doc in documents[:10] %}\n",
	" \"{{ doc.text }}\"\n",
	" {% endfor %}\n",
	"\n",
	" What's the population of {{ country }}?\n",
	"\"\"\"\n",
	"prompt_builder = PromptBuilder(template=prompt_template)\n",
	"\n",
	"\n",
	"# fetcher -> converter -> text_splitter -> prompt_builder -> llm\n",
	"pipe = Pipeline()\n",
	"pipe.add_component(\"fetcher\", fetcher)\n",
	"pipe.add_component(\"converter\", converter)\n",
	"pipe.add_component(\"text_splitter\", text_splitter)\n",
	"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
	"pipe.add_component(\"llm\", generator)\n",
	"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
	"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
	"pipe.connect(\"text_splitter.documents\", \"prompt_builder.documents\")\n",
	"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
	"\n",
	"pipe.draw(\"web-retrieval-rag.png\")\n",
	"\n",
	"# The country name fills the template and also selects the Wikipedia page to fetch.\n",
	"country = \"Spain\"\n",
	"pipe.run({\"prompt_builder\": {\"country\": country}, \"fetcher\": {\"urls\": [f\"https://en.wikipedia.org/wiki/{country}\"]}})\n"
	]
	}
	],
	"metadata": {
	"colab": {
	"provenance": []
	},
	"kernelspec": {
	"display_name": "Python 3",
	"name": "python3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}