Last active
October 25, 2023 22:13
-
-
Save ZanSara/5975901eea972c126f8e1c2341686dfb to your computer and use it in GitHub Desktop.
Office Hours - RAG Pipelines 2.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "gixkExYMh9cA", | |
"outputId": "4f65c6ce-272d-44d6-8ca4-739fde32a9c0" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Install haystack & some deps\n", | |
"%pip install posthog\n", | |
"%pip install langdetect\n", | |
"%pip install boilerpy3\n", | |
"%pip install haystack-ai" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "wzdNlKb2To-z" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Get OpenAI API key\n", | |
"\n", | |
"import getpass\n", | |
"\n", | |
"api_key = getpass.getpass()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Ubo3JEG-UnNf" | |
}, | |
"source": [ | |
"## Direct query of an LLM" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "N-4JJYYai-i9" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create the LLM component & query it\n", | |
"\n", | |
"from haystack.preview.components.generators.openai.gpt import GPTGenerator\n", | |
"\n", | |
"generator = GPTGenerator(api_key=api_key)\n", | |
"\n", | |
"result = generator.run(prompt=\"What's the capital of France?\")\n", | |
"\n", | |
"result[\"replies\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "2u_50rAUjpod" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create a PromptBuilder\n", | |
"\n", | |
"from haystack.preview.components.builders.prompt_builder import PromptBuilder\n", | |
"\n", | |
"prompt_template = \"What's the population of {{ country }}?\"\n", | |
"prompt_builder = PromptBuilder(template = prompt_template)\n", | |
"\n", | |
"prompt_builder.run(country=\"France\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "14qTTokskoWJ" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Connect PromptBuilder and LLM\n", | |
"\n", | |
"from haystack.preview import Pipeline\n", | |
"\n", | |
"pipe = Pipeline()\n", | |
"pipe.add_component(\"prompt_builder\", prompt_builder)\n", | |
"pipe.add_component(\"llm\", generator)\n", | |
"pipe.connect(\"prompt_builder\", \"llm\")\n", | |
"\n", | |
"country = \"Germany\"\n", | |
"results = pipe.run({\"prompt_builder\": {\"country\": country}})\n", | |
"\n", | |
"results[\"llm\"][\"replies\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "NWDO5EUgVGhN" | |
}, | |
"outputs": [], | |
"source": [ | |
"pipe.draw(\"simple-llm-pipeline.png\")" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "3V5sCS1bUqzS" | |
}, | |
"source": [ | |
"## \"Ground\" the LLM replies" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "zF1hosmwFxzG" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Write some small documents to a document store\n", | |
"\n", | |
"from haystack.preview.dataclasses import Document\n", | |
"from haystack.preview.document_stores import InMemoryDocumentStore\n", | |
"\n", | |
"documents = [Document(text=\"The population of Germany is 100 million people.\"), Document(text=\"About 65 million people live in France as of today.\")]\n", | |
"\n", | |
"docstore = InMemoryDocumentStore()\n", | |
"docstore.write_documents(documents=documents)\n", | |
"\n", | |
"docstore.filter_documents()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "KvZnsKRyNH9j" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Include a document's text in the prompt template\n", | |
"\n", | |
"prompt_template = \"\"\"\n", | |
" According to this document:\n", | |
"\n", | |
" \"{{ document.text }}\"\n", | |
"\n", | |
" What's the population of {{ country }}?\n", | |
"\"\"\"\n", | |
"prompt_builder = PromptBuilder(template=prompt_template)\n", | |
"\n", | |
"prompt_builder.run(document=documents[0], country=\"Germany\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "hhzctATfNH1H" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Query the pipeline by passing the country name AND a document by hand (documents[0] is the Germany document, although the question is about France)\n", | |
"\n", | |
"from haystack.preview import Pipeline\n", | |
"\n", | |
"pipe = Pipeline()\n", | |
"pipe.add_component(\"prompt_builder\", prompt_builder)\n", | |
"pipe.add_component(\"llm\", generator)\n", | |
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n", | |
"\n", | |
"country = \"France\"\n", | |
"results = pipe.run({\"prompt_builder\": {\"document\": documents[0], \"country\": country}})\n", | |
"\n", | |
"results[\"llm\"][\"replies\"]" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "58LSha5lVeue" | |
}, | |
"source": [ | |
"## Retrieve the correct Documents for the PromptBuilder" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "fI8Ml68KOYwn" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Add a BM25Retriever\n", | |
"\n", | |
"from haystack.preview.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n", | |
"\n", | |
"prompt_template = \"\"\"\n", | |
" According to these documents:\n", | |
"\n", | |
" {% for doc in documents %}\n", | |
" \"{{ doc.text }}\"\n", | |
" {% endfor %}\n", | |
"\n", | |
" What's the population of {{ country }}?\n", | |
"\"\"\"\n", | |
"prompt_builder = PromptBuilder(template=prompt_template)\n", | |
"retriever = InMemoryBM25Retriever(document_store=docstore)\n", | |
"\n", | |
"pipe = Pipeline()\n", | |
"pipe.add_component(\"retriever\", retriever)\n", | |
"pipe.add_component(\"prompt_builder\", prompt_builder)\n", | |
"pipe.add_component(\"llm\", generator)\n", | |
"pipe.connect(\"retriever.documents\", \"prompt_builder.documents\")\n", | |
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n", | |
"\n", | |
"country = \"Germany\"\n", | |
"results = pipe.run({\n", | |
" \"retriever\": {\"query\": country},\n", | |
" \"prompt_builder\": {\"country\": country}\n", | |
" }\n", | |
")\n", | |
"\n", | |
"results[\"llm\"][\"replies\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "v4mLl5OoWHpZ" | |
}, | |
"outputs": [], | |
"source": [ | |
"pipe.draw(\"rag-pipeline.png\")" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "3m5NMSoIWREQ" | |
}, | |
"source": [ | |
"## Retrieve from the Web" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "qgPbUyhaPpNw" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Try out the web retrieval components\n", | |
"\n", | |
"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n", | |
"from haystack.preview.components.file_converters.html import HTMLToDocument\n", | |
"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n", | |
"\n", | |
"fetcher = LinkContentFetcher()\n", | |
"converter = HTMLToDocument()\n", | |
"text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n", | |
"\n", | |
"pipe = Pipeline()\n", | |
"pipe.add_component(\"fetcher\", fetcher)\n", | |
"pipe.add_component(\"converter\", converter)\n", | |
"pipe.add_component(\"text_splitter\", text_splitter)\n", | |
"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n", | |
"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n", | |
"\n", | |
"pipe.draw(\"web-retriever.png\")\n", | |
"\n", | |
"pipe.run({\"fetcher\": {\"urls\": [\"https://en.wikipedia.org/wiki/Germany\"]}})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "9zgOE0vCWbu7" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Put them all together\n", | |
"from haystack.preview.components.fetchers.link_content import LinkContentFetcher\n", | |
"from haystack.preview.components.file_converters.html import HTMLToDocument\n", | |
"from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter\n", | |
"\n", | |
"fetcher = LinkContentFetcher()\n", | |
"converter = HTMLToDocument()\n", | |
"text_splitter = TextDocumentSplitter(split_by=\"sentence\", split_length=10)\n", | |
"\n", | |
"\n", | |
"prompt_template = \"\"\"\n", | |
" According to these documents:\n", | |
"\n", | |
" {% for doc in documents[:10] %}\n", | |
" \"{{ doc.text }}\"\n", | |
" {% endfor %}\n", | |
"\n", | |
" What's the population of {{ country }}?\n", | |
"\"\"\"\n", | |
"prompt_builder = PromptBuilder(template=prompt_template)\n", | |
"\n", | |
"\n", | |
"pipe = Pipeline()\n", | |
"pipe.add_component(\"fetcher\", fetcher)\n", | |
"pipe.add_component(\"converter\", converter)\n", | |
"pipe.add_component(\"text_splitter\", text_splitter)\n", | |
"pipe.add_component(\"prompt_builder\", prompt_builder)\n", | |
"pipe.add_component(\"llm\", generator)\n", | |
"pipe.connect(\"fetcher.streams\", \"converter.sources\")\n", | |
"pipe.connect(\"converter.documents\", \"text_splitter.documents\")\n", | |
"pipe.connect(\"text_splitter.documents\", \"prompt_builder.documents\")\n", | |
"pipe.connect(\"prompt_builder.prompt\", \"llm.prompt\")\n", | |
"\n", | |
"pipe.draw(\"web-retrieval-rag.png\")\n", | |
"\n", | |
"country = \"Spain\"\n", | |
"pipe.run({\"prompt_builder\": {\"country\": country}, \"fetcher\": {\"urls\": [f\"https://en.wikipedia.org/wiki/{country}\"]}})\n" | |
] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment