Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Last active October 25, 2023 22:13
Show Gist options
  • Save ZanSara/5975901eea972c126f8e1c2341686dfb to your computer and use it in GitHub Desktop.
Save ZanSara/5975901eea972c126f8e1c2341686dfb to your computer and use it in GitHub Desktop.
Office Hours - RAG Pipelines 2.0
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gixkExYMh9cA",
"outputId": "4f65c6ce-272d-44d6-8ca4-739fde32a9c0"
},
"outputs": [],
"source": [
"# Install haystack & some deps\n",
"%pip install posthog\n",
"%pip install langdetect\n",
"%pip install boilerpy3\n",
"%pip install haystack-ai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wzdNlKb2To-z"
},
"outputs": [],
"source": [
"# Get OpenAI API key\n",
"\n",
"import getpass\n",
"\n",
"api_key = getpass.getpass()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "Ubo3JEG-UnNf"
},
"source": [
"## Direct query of an LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "N-4JJYYai-i9"
},
"outputs": [],
"source": [
"# Create the LLM component & query it\n",
"\n",
"from haystack.preview.components.generators.openai.gpt import GPTGenerator\n",
"\n",
"generator = GPTGenerator(api_key=api_key)\n",
"\n",
"result = generator.run(prompt=\"What's the capital of France?\")\n",
"\n",
"result[\"replies\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2u_50rAUjpod"
},
"outputs": [],
"source": [
"# Create a PromptBuilder\n",
"\n",
"from haystack.preview.components.builders.prompt_builder import PromptBuilder\n",
"\n",
"prompt_template = \"What's the population of {{ country }}?\"\n",
"prompt_builder = PromptBuilder(template = prompt_template)\n",
"\n",
"prompt_builder.run(country=\"France\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "14qTTokskoWJ"
},
"outputs": [],
"source": [
"# Connect PromptBuilder and LLM\n",
"\n",
"from haystack.preview import Pipeline\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
"pipe.add_component(\"llm\", generator)\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"country = \"Germany\"\n",
"results = pipe.run({\"prompt_builder\": {\"country\": country}})\n",
"\n",
"results[\"llm\"][\"replies\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NWDO5EUgVGhN"
},
"outputs": [],
"source": [
"pipe.draw(\"simple-llm-pipeline.png\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "3V5sCS1bUqzS"
},
"source": [
"## \"Ground\" the LLM replies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zF1hosmwFxzG"
},
"outputs": [],
"source": [
"# Write some small documents to a document store\n",
"\n",
"from haystack.preview.dataclasses import Document\n",
"from haystack.preview.document_stores import InMemoryDocumentStore\n",
"\n",
"documents = [Document(text=\"The population of Germany is 100 million people.\"), Document(text=\"About 65 million people live in France as of today.\")]\n",
"\n",
"docstore = InMemoryDocumentStore()\n",
"docstore.write_documents(documents=documents)\n",
"\n",
"docstore.filter_documents()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KvZnsKRyNH9j"
},
"outputs": [],
"source": [
"# Include the documents into the prompt template\n",
"\n",
"prompt_template = \"\"\"\n",
" According to this document:\n",
"\n",
" \"{{ document.text }}\"\n",
"\n",
" What's the population of {{ country }}?\n",
"\"\"\"\n",
"prompt_builder = PromptBuilder(template=prompt_template)\n",
"\n",
"prompt_builder.run(document=documents[0], country=\"Germany\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hhzctATfNH1H"
},
"outputs": [],
"source": [
"# Query the pipeline by passing the country name AND the document\n",
"\n",
"from haystack.preview import Pipeline\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
"pipe.add_component(\"llm\", generator)\n",
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
"\n",
"country = \"France\"\n",
"results = pipe.run({\"prompt_builder\": {\"document\": documents[0], \"country\": country}})\n",
"\n",
"results[\"llm\"][\"replies\"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "58LSha5lVeue"
},
"source": [
"## Retrieve the correct Documents for the PromptBuilder"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fI8Ml68KOYwn"
},
"outputs": [],
"source": [
"# Add a BM25Retriever\n",
"\n",
"from haystack.preview.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n",
"\n",
"prompt_template = \"\"\"\n",
" According to this documents:\n",
"\n",
" {% for doc in documents %}\n",
" \"{{ doc.text }}\"\n",
" {% endfor %}\n",
"\n",
" What's the population of {{ country }}?\n",
"\"\"\"\n",
"prompt_builder = PromptBuilder(template=prompt_template)\n",
"retriever = InMemoryBM25Retriever(document_store=docstore)\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"retriever\", retriever)\n",
"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
"pipe.add_component(\"llm\", generator)\n",
"pipe.connect(\"retriever.documents\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
"\n",
"country = \"Germany\"\n",
"results = pipe.run({\n",
" \"retriever\": {\"query\": country},\n",
" \"prompt_builder\": {\"country\": country}\n",
" }\n",
")\n",
"\n",
"results[\"llm\"][\"replies\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "v4mLl5OoWHpZ"
},
"outputs": [],
"source": [
"pipe.draw(\"rag-pipeline.png\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "3m5NMSoIWREQ"
},
"source": [
"## Retrieve from the Web"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qgPbUyhaPpNw"
},
"outputs": [],
"source": [
"# Try out the web retrieval components\n",
"\n",
"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
"from haystack.preview.components.file_converters.html import HTMLToDocument\n",
"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
"\n",
"fetcher = LinkContentFetcher()\n",
"converter = HTMLToDocument()\n",
"text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"fetcher\", fetcher)\n",
"pipe.add_component(\"converter\", converter)\n",
"pipe.add_component(\"text_splitter\", text_splitter)\n",
"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
"\n",
"pipe.draw(\"web-retriever.png\")\n",
"\n",
"pipe.run({\"fetcher\": {\"urls\": [\"https://en.wikipedia.org/wiki/Germany\"]}})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9zgOE0vCWbu7"
},
"outputs": [],
"source": [
"# Put them all together\n",
"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n",
"from haystack.preview.components.file_converters.html import HTMLToDocument\n",
"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n",
"\n",
"fetcher = LinkContentFetcher()\n",
"converter = HTMLToDocument()\n",
"text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n",
"\n",
"\n",
"prompt_template = \"\"\"\n",
" According to this documents:\n",
"\n",
" {% for doc in documents[:10] %}\n",
" \"{{ doc.text }}\"\n",
" {% endfor %}\n",
"\n",
" What's the population of {{ country }}?\n",
"\"\"\"\n",
"prompt_builder = PromptBuilder(template=prompt_template)\n",
"\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"fetcher\", fetcher)\n",
"pipe.add_component(\"converter\", converter)\n",
"pipe.add_component(\"text_splitter\", text_splitter)\n",
"pipe.add_component(\"prompt_builder\", prompt_builder)\n",
"pipe.add_component(\"llm\", generator)\n",
"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n",
"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n",
"pipe.connect(\"text_splitter.documents\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n",
"\n",
"pipe.draw(\"web-retrieval-rag.png\")\n",
"\n",
"country = \"Spain\"\n",
"pipe.run({\"prompt_builder\": {\"country\": country}, \"fetcher\": {\"urls\": [f\"https://en.wikipedia.org/wiki/{country}\"]}})\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment