Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ZanSara/6075d418c1494e780f7098db32bc6cf6 to your computer and use it in GitHub Desktop.
Save ZanSara/6075d418c1494e780f7098db32bc6cf6 to your computer and use it in GitHub Desktop.
DataHour_Optimizing_LLMs_with_Retrieval_Augmented_Generation_and_Haystack_2_0.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"37a5fefd489a4ba1b187680e88e94c5d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_04f605683a7145ae9c198c1aff60c8f3",
"IPY_MODEL_479dd63e42fd443ca9565a902a64c865",
"IPY_MODEL_d75b5856839d484ca75a73b1176f7d73"
],
"layout": "IPY_MODEL_6a1db87bd49b4e6d9164250647315ee7"
}
},
"04f605683a7145ae9c198c1aff60c8f3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_d485501d1efd46ae93e9efca5adc445b",
"placeholder": "​",
"style": "IPY_MODEL_bcec6b2cef3947828bb2298ecf9f1ed9",
"value": "Ranking by BM25...: 100%"
}
},
"479dd63e42fd443ca9565a902a64c865": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b7012a41560840a69edfc6fb65e4f5ac",
"max": 5,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_ad4992cf76514fd4b5f9217b72cd985d",
"value": 5
}
},
"d75b5856839d484ca75a73b1176f7d73": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a4c633e1e57b4f51b4ed9e5c1b2014e5",
"placeholder": "​",
"style": "IPY_MODEL_3638dacfe22e43198435c6b12a7aa4f5",
"value": " 5/5 [00:00<00:00, 91.26 docs/s]"
}
},
"6a1db87bd49b4e6d9164250647315ee7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d485501d1efd46ae93e9efca5adc445b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"bcec6b2cef3947828bb2298ecf9f1ed9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b7012a41560840a69edfc6fb65e4f5ac": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ad4992cf76514fd4b5f9217b72cd985d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"a4c633e1e57b4f51b4ed9e5c1b2014e5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3638dacfe22e43198435c6b12a7aa4f5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"eb9ab61725f6485c903c4e87517ca82b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9a00817ab8914a13ae88c059af40241e",
"IPY_MODEL_3fbf5f12c119403ca10a3c84890257e7",
"IPY_MODEL_c179eb52519c4d1f98b3c568563bf445"
],
"layout": "IPY_MODEL_432b2eeaf477483499d9cf81809c19c8"
}
},
"9a00817ab8914a13ae88c059af40241e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_88cbc9c8281944ec96b02399dec7a338",
"placeholder": "​",
"style": "IPY_MODEL_b59da026371a416184923bd9dc150bc0",
"value": "Ranking by BM25...: 100%"
}
},
"3fbf5f12c119403ca10a3c84890257e7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_863f0a81c65244518620dc24717f8487",
"max": 5,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_860c2a326e2d4792ac680ed8142bca9c",
"value": 5
}
},
"c179eb52519c4d1f98b3c568563bf445": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2332b3910fd2455595bfaee467b297a1",
"placeholder": "​",
"style": "IPY_MODEL_f92f3ab1793240aaa0a774af1c02cd8b",
"value": " 5/5 [00:00<00:00, 190.49 docs/s]"
}
},
"432b2eeaf477483499d9cf81809c19c8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"88cbc9c8281944ec96b02399dec7a338": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b59da026371a416184923bd9dc150bc0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"863f0a81c65244518620dc24717f8487": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"860c2a326e2d4792ac680ed8142bca9c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"2332b3910fd2455595bfaee467b297a1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f92f3ab1793240aaa0a774af1c02cd8b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ZanSara/6075d418c1494e780f7098db32bc6cf6/datahour_optimizing_llms_with_retrieval_augmented_generation_and_haystack_2_0.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Optimizing LLMs with Retrieval Augmented Generation and Haystack 2.0\n",
"\n",
"_by Sara Zanzottera: [LinkedIn](https://www.linkedin.com/in/sarazanzottera), [Twitter](https://twitter.com/zansara_dev)._\n",
"\n",
"_This Colab notebook was presented at the DataHour workshop titled \"Optimizing LLMs with Retrieval Augmented Generation and Haystack 2.0\"._\n",
"\n",
"---\n",
"\n",
"In this notebook we're going to:\n",
"- Install Haystack\n",
"- Send a query to gpt-3.5-turbo\n",
"- Build a simple generative pipeline\n",
"- Enhance the context “by hand” with a better prompt\n",
"- Create a Document Store and add the context to it\n",
"- Build a RAG pipeline\n",
"- Adapt the pipeline to search the Web instead of the Document Store\n",
"\n",
"At the end, we move to another Colab to see how to build a custom component.\n"
],
"metadata": {
"id": "Yt8anOOuIFO7"
}
},
{
"cell_type": "markdown",
"source": [
"## Install Haystack"
],
"metadata": {
"id": "n1DbBwkqX-ax"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gixkExYMh9cA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "cd9f1198-4295-4415-e907-9aebd2360e28"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: haystack-ai in /usr/local/lib/python3.10/dist-packages (2.0.0b2)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n",
"Requirement already satisfied: boilerpy3 in /usr/local/lib/python3.10/dist-packages (1.0.7)\n",
"Requirement already satisfied: sentence_transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.2)\n",
"Requirement already satisfied: lazy-imports in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.3.1)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.1.0)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.2.1)\n",
"Requirement already satisfied: openai<1.0.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.28.1)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.5.3)\n",
"Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.0)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.1)\n",
"Requirement already satisfied: rank-bm25 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.2.2)\n",
"Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (8.2.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.5.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.4)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n",
"Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n",
"Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (2.1.0+cu121)\n",
"Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (0.16.0+cu121)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (1.2.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (1.11.4)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (3.8.1)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (0.1.99)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai<1.0.0->haystack-ai) (3.9.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (1.12)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (2.1.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.3)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence_transformers) (8.1.7)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence_transformers) (1.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2023.3.post1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.16.0)\n",
"Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.6)\n",
"Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.2.1)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence_transformers) (3.2.0)\n",
"Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence_transformers) (9.4.0)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (6.0.4)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (1.9.4)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (1.3.1)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<1.0.0->haystack-ai) (4.0.3)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence_transformers) (1.3.0)\n"
]
}
],
"source": [
"%pip install haystack-ai transformers boilerpy3 sentence_transformers"
]
},
{
"cell_type": "markdown",
"source": [
"## Send a direct question to gpt-3.5-turbo\n",
"\n",
"To query LLMs, Haystack offers a simple class of components called \"Generators\". We want to use gpt-3.5-turbo for this example, so we use [`GPTGenerator`](https://docs.haystack.deepset.ai/v2.0/docs/gptgenerator?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar)."
],
"metadata": {
"id": "Ubo3JEG-UnNf"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"import getpass\n",
"\n",
"api_key = getpass.getpass()"
],
"metadata": {
"id": "NWcQYwOYQADg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "423ae718-3733-415c-de86-672bf6a66051"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"··········\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.components.generators import GPTGenerator\n",
"\n",
"generator = GPTGenerator(api_key=api_key)\n",
"\n",
"generator.run(prompt=\"Who is the CEO of OpenAI?\")"
],
"metadata": {
"id": "N-4JJYYai-i9",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "7b9a85cc-775f-4273-9618-9970e8b816e0"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'replies': ['As of 2021, the CEO of OpenAI is Sam Altman.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 15,\n",
" 'completion_tokens': 16,\n",
" 'total_tokens': 31}}]}"
]
},
"metadata": {},
"execution_count": 43
}
]
},
{
"cell_type": "markdown",
"source": [
"## Send the same query to Mixtral using HuggingFace TGI\n",
"\n",
"Haystack supports a variety of LLM providers. For example, we can also use [`HuggingFaceTGIGenerator`](https://docs.haystack.deepset.ai/v2.0/docs/huggingfacetgigenerator?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar) to send the same query to the latest model by MistralAI, Mixtral-8x7B-Instruct-v0.1."
],
"metadata": {
"id": "K6hCtZsAJlvS"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"import getpass\n",
"\n",
"hf_api_key = getpass.getpass()"
],
"metadata": {
"id": "MYZ7X3jvI4ek",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "cc317f9d-f665-4758-99c0-bf24f537145a"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"··········\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.components.generators import HuggingFaceTGIGenerator\n",
"\n",
"generator = HuggingFaceTGIGenerator(\"mistralai/Mixtral-8x7B-Instruct-v0.1\", token=hf_api_key)\n",
"generator.warm_up()"
],
"metadata": {
"id": "FwTsUI_6JDCv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"generator.run(\"Who is the CEO of OpenAI?\", generation_kwargs={\"max_new_tokens\": 350})"
],
"metadata": {
"id": "Hg-OEb-NJGFF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "290fa104-67ca-4eb8-ea14-f06d62532d77"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'replies': ['\\n\\nSam Altman\\n\\nSam Altman is the CEO of OpenAI, a non-profit artificial intelligence research organization. He is also the co-chairman of the board and a former president of Y Combinator, a startup accelerator. Altman has a background in computer science and has been involved in several successful tech startups. He is known for his interest in artificial intelligence and its potential impact on society.'],\n",
" 'metadata': [{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1',\n",
" 'index': 0,\n",
" 'finish_reason': 'eos_token',\n",
" 'usage': {'completion_tokens': 88,\n",
" 'prompt_tokens': 8,\n",
" 'total_tokens': 96}}]}"
]
},
"metadata": {},
"execution_count": 46
}
]
},
{
"cell_type": "markdown",
"source": [
"## Create a modular prompt\n",
"\n",
"Our final prompt is going to be quite long. Haystack provides a component to assemble prompts dynamically, using [Jinja2's templating syntax](https://jinja.palletsprojects.com/en/3.1.x/templates/). This component is called [`PromptBuilder`](https://docs.haystack.deepset.ai/v2.0/docs/promptbuilder?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar)."
],
"metadata": {
"id": "kF6LrxvHVp9b"
}
},
{
"cell_type": "code",
"source": [
"from haystack.components.builders import PromptBuilder\n",
"\n",
"prompt_builder = PromptBuilder(template=\"Who is the CEO of {{ company }}?\")\n",
"\n",
"prompt_builder.run(company=\"OpenAI\")"
],
"metadata": {
"id": "2u_50rAUjpod",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a7151c98-e203-4728-f5b2-424b1bad5630"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'prompt': 'Who is the CEO of OpenAI?'}"
]
},
"metadata": {},
"execution_count": 47
}
]
},
{
"cell_type": "markdown",
"source": [
"## Building a simple generative pipeline\n",
"\n",
"Now that we have a Generator and a [PromptBuilder](https://docs.haystack.deepset.ai/v2.0/docs/promptbuilder?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar), we can assemble our first pipeline by connecting them together."
],
"metadata": {
"id": "zfsbF9aqVtfX"
}
},
{
"cell_type": "code",
"source": [
"from haystack import Pipeline\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=\"Who is the CEO of {{ company }}?\"))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"prompt_builder\", \"llm\")"
],
"metadata": {
"id": "14qTTokskoWJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"simple-generative-pipeline.png\")"
],
"metadata": {
"id": "NWDO5EUgVGhN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.run({\"prompt_builder\": {\"company\": \"OpenAI\"}})"
],
"metadata": {
"id": "ActCxkHUMvH6",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "087e0b57-b291-4306-a1b6-6e929a47aa1f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['As of October 2021, the CEO of OpenAI is Sam Altman.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 15,\n",
" 'completion_tokens': 17,\n",
" 'total_tokens': 32}}]}}"
]
},
"metadata": {},
"execution_count": 52
}
]
},
{
"cell_type": "markdown",
"source": [
"## Give more context to the LLM\n",
"\n",
"First, let's do the retrieval by hand and copy-paste some relevant information into the prompt."
],
"metadata": {
"id": "5ELhDczXWYPq"
}
},
{
"cell_type": "code",
"source": [
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=\"When was Sam Altman dismissed as CEO of {{ company }}?\"))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"pipe.run({\"prompt_builder\": {\"company\": \"OpenAI\"}})"
],
"metadata": {
"id": "5Et5UzpqJHmt",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b2a9ceea-27ea-48de-ab30-c51002ed9f67"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['Sam Altman was never dismissed as CEO of OpenAI. He served as the President of Y Combinator and later became the CEO of OpenAI. However, in March 1, 2021, he stepped down as CEO, and was succeeded by Sam Bankman-Fried, the founder and CEO of FTX.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 19,\n",
" 'completion_tokens': 66,\n",
" 'total_tokens': 85}}]}}"
]
},
"metadata": {},
"execution_count": 53
}
]
},
{
"cell_type": "code",
"source": [
"template = \"\"\"\n",
"Answer the question using the content of the text.\n",
"\n",
"Text: {{ context }}\n",
"\n",
"Question: {{ question }}\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"prompt_builder\", \"llm\")"
],
"metadata": {
"id": "Gc2w5MD1WzzB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"generative-pipeline-with-context.png\")"
],
"metadata": {
"id": "4itSQW0FN2P_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.run({\n",
" \"prompt_builder\": {\n",
" \"question\": \"When was Sam Altman dismissed as CEO of OpenAI?\",\n",
" \"context\": \"\"\"Blog\n",
"OpenAI announces leadership transition\n",
"November 17, 2023\n",
"\n",
"Chief technology officer Mira Murati appointed interim CEO to lead OpenAI; Sam Altman departs the company.\n",
"\n",
"Search process underway to identify permanent successor.\n",
"\n",
"The board of directors of OpenAI, Inc., the 501(c)(3) that acts as the overall governing body for all OpenAI activities, today announced that Sam Altman will depart as CEO and leave the board of directors. Mira Murati, the company’s chief technology officer, will serve as interim CEO, effective immediately.\n",
"\n",
"A member of OpenAI’s leadership team for five years, Mira has played a critical role in OpenAI’s evolution into a global AI leader. She brings a unique skill set, understanding of the company’s values, operations, and business, and already leads the company’s research, product, and safety functions. Given her long tenure and close engagement with all aspects of the company, including her experience in AI governance and policy, the board believes she is uniquely qualified for the role and anticipates a seamless transition while it conducts a formal search for a permanent CEO.\n",
"\n",
"Mr. Altman’s departure follows a deliberative review process by the board, which concluded that he was not consistently candid in his communications with the board, hindering its ability to exercise its responsibilities. The board no longer has confidence in his ability to continue leading OpenAI.\n",
"\n",
"In a statement, the board of directors said: “OpenAI was deliberately structured to advance our mission: to ensure that artificial general intelligence benefits all humanity. The board remains fully committed to serving this mission. We are grateful for Sam’s many contributions to the founding and growth of OpenAI. At the same time, we believe new leadership is necessary as we move forward. As the leader of the company’s research, product, and safety functions, Mira is exceptionally qualified to step into the role of interim CEO. We have the utmost confidence in her ability to lead OpenAI during this transition period.”\n",
"\n",
"OpenAI’s board of directors consists of OpenAI chief scientist Ilya Sutskever, independent directors Quora CEO Adam D’Angelo, technology entrepreneur Tasha McCauley, and Georgetown Center for Security and Emerging Technology’s Helen Toner.\n",
"\n",
"As a part of this transition, Greg Brockman will be stepping down as chairman of the board and will remain in his role at the company, reporting to the CEO.\n",
"\n",
"OpenAI was founded as a non-profit in 2015 with the core mission of ensuring that artificial general intelligence benefits all of humanity. In 2019, OpenAI restructured to ensure that the company could raise capital in pursuit of this mission, while preserving the nonprofit's mission, governance, and oversight. The majority of the board is independent, and the independent directors do not hold equity in OpenAI. While the company has experienced dramatic growth, it remains the fundamental governance responsibility of the board to advance OpenAI’s mission and preserve the principles of its Charter.\n",
"\"\"\",\n",
" }\n",
"})"
],
"metadata": {
"id": "O2dmAVb_N0bF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b4415a74-1d05-4012-8146-2ab6cd1fd523"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['Sam Altman was dismissed as CEO of OpenAI on November 17, 2023.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 636,\n",
" 'completion_tokens': 19,\n",
" 'total_tokens': 655}}]}}"
]
},
"metadata": {},
"execution_count": 57
}
]
},
{
"cell_type": "markdown",
"source": [
"# Retrieving the context\n",
"\n",
"Now, let's see how we can manage the context in a programmatic way. One way is to store all the information in a Document Store. Haystack offers interfaces to many popular vector stores, plus a small toy implementation for demos called [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/v2.0/docs/document-store?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar), with the same API as the major ones.\n",
"\n",
"Once the data is stored, we can use a Retriever to get only the most relevant data. In this demo we are going to use a retriever based on BM25, but Haystack supports dense retrieval methods too."
],
"metadata": {
"id": "3V5sCS1bUqzS"
}
},
{
"cell_type": "code",
"source": [
"from haystack.dataclasses import Document\n",
"from haystack.document_stores import InMemoryDocumentStore\n",
"\n",
"documents = [\n",
" Document(content=\"OpenAI is an American artificial intelligence (AI) research organization consisting of the non-profit OpenAI, Inc.[5] registered in Delaware and its for-profit subsidiary OpenAI Global, LLC.[6]. At the time of writing, the CEO is Sam Altman.\"),\n",
" Document(content=\"July 5, 2021: Amazon founder Jeff Bezos will hand over his chief executive title to Andy Jassy, ending a more than two-decade run leading the company through its evolution from online bookseller to $1.75 trillion global retail, logistics and internet behemoth. \"),\n",
" Document(content=\"OpenAI announces leadership transition - November 17, 2023: Chief technology officer Mira Murati appointed interim CEO to lead OpenAI; Sam Altman departs the company. The board of directors of OpenAI, Inc., the 501(c)(3) that acts as the overall governing body for all OpenAI activities, today announced that Sam Altman will depart as CEO and leave the board of directors. Mira Murati, the company’s chief technology officer, will serve as interim CEO, effective immediately.\"),\n",
" Document(content=\"Sam Altman has been dismissed as the CEO of OpenAI. The company says it made the change after a “deliberative review process” by the board.\"),\n",
" Document(content=\"Twitter’s Jack Dorsey Steps Down From C.E.O. Role - November 29, 2021: The social media pioneer, whose name has become synonymous with the company, was replaced by Twitter’s chief technology officer, Parag Agrawal.\"),\n",
"]\n",
"docstore = InMemoryDocumentStore()\n",
"docstore.write_documents(documents=documents)\n",
"\n",
"docstore.filter_documents()"
],
"metadata": {
"id": "j0s5v7VCXhFo",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a36f2c3f-66ae-4499-9bd9-5d28a8771a6e"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[Document(id=f7e4c686190b968ac992528f3ec6fa154f8dbdaeaa22474892d0401aeb801e2d, content: 'OpenAI is an American artificial intelligence (AI) research organization consisting of the non-profi...'),\n",
" Document(id=67a183be228fa15c73af7e79b720c3b23b2b7c21f339738c9f680c3eab5db90a, content: 'July 5, 2021: Amazon founder Jeff Bezos will hand over his chief executive title to Andy Jassy, endi...'),\n",
" Document(id=6aa4db8f0b516cc9aee252d5fa6bf2667ea12c0afaf8846d179cdffdf35ce67d, content: 'OpenAI announces leadership transition - November 17, 2023: Chief technology officer Mira Murati app...'),\n",
" Document(id=43f190800649253157cc7b9de4ac5b60efae00289d300be8e99b1d622c77985e, content: 'Sam Altman has been dismissed as the CEO of OpenAI. The company says it made the change after a “del...'),\n",
" Document(id=938669ff5b680c5334c019596df93e9602fc9f23851d4babf627d739dbc337c2, content: 'Twitter’s Jack Dorsey Steps Down From C.E.O. Role - November 29, 2021: The social media pioneer, who...')]"
]
},
"metadata": {},
"execution_count": 58
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n",
"\n",
"retriever = InMemoryBM25Retriever(document_store=docstore)"
],
"metadata": {
"id": "hxpU-H_gXiNt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"retriever.run(query=\"When was Sam Altman dismissed as CEO of OpenAI?\", top_k=3)"
],
"metadata": {
"id": "J6va4mCuXiIU",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 153,
"referenced_widgets": [
"37a5fefd489a4ba1b187680e88e94c5d",
"04f605683a7145ae9c198c1aff60c8f3",
"479dd63e42fd443ca9565a902a64c865",
"d75b5856839d484ca75a73b1176f7d73",
"6a1db87bd49b4e6d9164250647315ee7",
"d485501d1efd46ae93e9efca5adc445b",
"bcec6b2cef3947828bb2298ecf9f1ed9",
"b7012a41560840a69edfc6fb65e4f5ac",
"ad4992cf76514fd4b5f9217b72cd985d",
"a4c633e1e57b4f51b4ed9e5c1b2014e5",
"3638dacfe22e43198435c6b12a7aa4f5"
]
},
"outputId": "94d7f191-9f97-4ff5-b4c6-6a318bc0505e"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/5 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "37a5fefd489a4ba1b187680e88e94c5d"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'documents': [Document(id=43f190800649253157cc7b9de4ac5b60efae00289d300be8e99b1d622c77985e, content: 'Sam Altman has been dismissed as the CEO of OpenAI. The company says it made the change after a “del...', score: 3.121376680691861),\n",
" Document(id=6aa4db8f0b516cc9aee252d5fa6bf2667ea12c0afaf8846d179cdffdf35ce67d, content: 'OpenAI announces leadership transition - November 17, 2023: Chief technology officer Mira Murati app...', score: 1.947426182532291),\n",
" Document(id=f7e4c686190b968ac992528f3ec6fa154f8dbdaeaa22474892d0401aeb801e2d, content: 'OpenAI is an American artificial intelligence (AI) research organization consisting of the non-profi...', score: 1.3982592226116737)]}"
]
},
"metadata": {},
"execution_count": 60
}
]
},
{
"cell_type": "markdown",
"source": [
"## Our first RAG Pipeline\n",
"\n",
"By putting together the Retriever, the PromptBuilder and the Generator, we can finally assemble our RAG pipeline. Let's see how!"
],
"metadata": {
"id": "ZSDjbzQ-QADx"
}
},
{
"cell_type": "code",
"source": [
"template = \"\"\"\n",
"Answer the question using the content of the text.\n",
"\n",
"Text: {% for document in documents %}\n",
" - {{ document.content }}\n",
"{% endfor %}\n",
"\n",
"Question: {{ question }}\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"retriever\", InMemoryBM25Retriever(document_store=docstore))\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder\", \"llm\")"
],
"metadata": {
"id": "eF_4D1wcQADx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"simple-rag-pipeline.png\")"
],
"metadata": {
"id": "sTGHFYVxPJOc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"question = \"When was Sam Altman dismissed as CEO of OpenAI?\"\n",
"pipe.run({\n",
" \"retriever\": {\"query\": question},\n",
" \"prompt_builder\": {\"question\": question}\n",
"})"
],
"metadata": {
"id": "hdSwGqYMQADx",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 171,
"referenced_widgets": [
"eb9ab61725f6485c903c4e87517ca82b",
"9a00817ab8914a13ae88c059af40241e",
"3fbf5f12c119403ca10a3c84890257e7",
"c179eb52519c4d1f98b3c568563bf445",
"432b2eeaf477483499d9cf81809c19c8",
"88cbc9c8281944ec96b02399dec7a338",
"b59da026371a416184923bd9dc150bc0",
"863f0a81c65244518620dc24717f8487",
"860c2a326e2d4792ac680ed8142bca9c",
"2332b3910fd2455595bfaee467b297a1",
"f92f3ab1793240aaa0a774af1c02cd8b"
]
},
"outputId": "a43c855a-decf-4b62-de15-a36a8995a713"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/5 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "eb9ab61725f6485c903c4e87517ca82b"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['Sam Altman was dismissed as CEO of OpenAI on November 17, 2023.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 346,\n",
" 'completion_tokens': 19,\n",
" 'total_tokens': 365}}]}}"
]
},
"metadata": {},
"execution_count": 63
}
]
},
{
"cell_type": "markdown",
"source": [
"## Searching the Web\n",
"\n",
"Collecting data into a Document Store may not be the right approach for our application. If we want to retrieve from the Web, either from Google or from a specific website (our documentation, our Notion, and so on), it's easier to retrieve directly at the source with a search engine.\n",
"\n",
"Haystack also offers integrations with some search engine APIs. For this demo we will use SerperDev through the [`SerperDevWebSearch`](https://docs.haystack.deepset.ai/v2.0/docs/serperdevwebsearch?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar) component."
],
"metadata": {
"id": "PxA-eYr6Rpi2"
}
},
{
"cell_type": "code",
"source": [
"import getpass\n",
"\n",
"serperdev_api_key = getpass.getpass()"
],
"metadata": {
"id": "9y3QdMTxRrXS",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "08b581e4-0d1a-4d7b-b374-7869a9360580"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"··········\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.components.websearch import SerperDevWebSearch\n",
"\n",
"search = SerperDevWebSearch(api_key=serperdev_api_key)\n",
"\n",
"results = search.run(query=\"When was Sam Altman dismissed as CEO of OpenAI?\")"
],
"metadata": {
"id": "egYr2z6mRzPE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for document in results[\"documents\"]:\n",
" print(\" - \", document.content)"
],
"metadata": {
"id": "Sf2wsH3MSerF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "578e777c-eaf1-4847-adac-73cc4dcab2af"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" - November 17\n",
" - Sam Altman has been fired as the CEO of OpenAI. The company says it made the change after a “deliberative review process” by the board.\n",
" - The OpenAI board ousted Altman, then attempted to rehire him, then reportedly balked at his terms — and then Microsoft hired him instead.\n",
" - Less than a week after he was fired as CEO by the OpenAI board, Sam Altman is back running the artificial intelligence startup.\n",
" - Date. Event ; Nov. 17. OpenAI board fires CEO, co-founder Sam Altman and President Greg Brockman quits after being removed from the board.\n",
" - Friday, November 17 – Altman stepped down from his role as CEO of OpenAI, the company announced, following a review undertaken by the company's ...\n",
" - UPDATE: Nov. 20, 2023, 9:33 a.m. EST Altman is replaced by OpenAI interim CEO Emmett Shear. Sam Altman has been fired from his position as ...\n",
" - OpenAI President Greg Brockman said he had decided to leave the company, as well, apparently in response to Altman's firing by the board.\n",
" - When 38-year-old Sam Altman, the co-founder and CEO of OpenAI, was fired, parallels were drawn between his departure from the ChatGPT maker ...\n",
" - OpenAI announced in a tweet Tuesday evening that it had “reached an agreement in principle” for ousted CEO Sam Altman to return to the company.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"results[\"links\"]"
],
"metadata": {
"id": "9C5r_f05SgFr",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "daa5e9ff-ebe3-474f-d479-e396048e3a7a"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['https://www.theverge.com/2023/11/17/23965982/openai-ceo-sam-altman-fired',\n",
" 'https://nymag.com/intelligencer/2023/11/why-was-sam-altman-fired-as-ceo-of-openai.html',\n",
" 'https://www.cnbc.com/2023/11/22/openai-brings-sam-altman-back-as-ceo-days-after-ouster.html',\n",
" 'https://www.reuters.com/technology/openai-ouster-microsoft-ai-research-ceo-sam-altmans-tumultuous-weekend-2023-11-20/',\n",
" 'https://abcnews.go.com/Business/sam-altman-reaches-deal-return-ceo-openai/story?id=105091534',\n",
" 'https://mashable.com/article/sam-altman-fired-openai-ceo',\n",
" 'https://www.barrons.com/articles/sam-altman-fired-openai-ceo-msft-2e2afd13',\n",
" 'https://indianexpress.com/article/technology/techook/everything-you-did-not-know-about-sam-altman-openais-sacked-ceo-9034423/',\n",
" 'https://www.forbes.com/sites/sarahemerson/2023/11/22/openai-announces-plan-for-fired-ceo-sam-altman-to-return-to-the-company/?sh=734fe4747e83']"
]
},
"metadata": {},
"execution_count": 67
}
]
},
{
"cell_type": "markdown",
"source": [
"## A minimal Web RAG Pipeline\n",
"\n",
"Haystack's strength is the flexibility of the pipeline and the ease of connecting components. In this example, see how we can take the exact same pipeline as before and, by simply swapping the retriever for the search engine, immediately do Web-based RAG."
],
"metadata": {
"id": "0aRCL7YyX70Q"
}
},
{
"cell_type": "code",
"source": [
"template = \"\"\"\n",
"Answer the question using the content of the text.\n",
"\n",
"Text: {% for document in documents %}\n",
" - {{ document.content }}\n",
"{% endfor %}\n",
"\n",
"Question: {{ question }}\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"search\", SerperDevWebSearch(api_key=serperdev_api_key))\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"search.documents\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder\", \"llm\")"
],
"metadata": {
"id": "jFmhoTfNXh7e"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"web-rag-pipeline.png\")"
],
"metadata": {
"id": "XkS0ozuZXhfV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"question = \"When was Sam Altman dismissed as CEO of OpenAI?\"\n",
"\n",
"pipe.run({\n",
" \"search\": {\"query\": question},\n",
" \"prompt_builder\": {\"question\": question}\n",
"})"
],
"metadata": {
"id": "6LxRLPShV7kn",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "446a2926-2c67-47e8-ef9f-b9cd5be0807f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'search': {'links': ['https://www.theverge.com/2023/11/17/23965982/openai-ceo-sam-altman-fired',\n",
" 'https://nymag.com/intelligencer/2023/11/why-was-sam-altman-fired-as-ceo-of-openai.html',\n",
" 'https://www.cnbc.com/2023/11/22/openai-brings-sam-altman-back-as-ceo-days-after-ouster.html',\n",
" 'https://abcnews.go.com/Business/sam-altman-reaches-deal-return-ceo-openai/story?id=105091534',\n",
" 'https://www.forbes.com/sites/sarahemerson/2023/11/22/openai-announces-plan-for-fired-ceo-sam-altman-to-return-to-the-company/?sh=734fe4747e83',\n",
" 'https://mashable.com/article/sam-altman-fired-openai-ceo',\n",
" 'https://indianexpress.com/article/technology/techook/everything-you-did-not-know-about-sam-altman-openais-sacked-ceo-9034423/',\n",
" 'https://www.barrons.com/articles/sam-altman-fired-openai-ceo-msft-2e2afd13',\n",
" 'https://cointelegraph.com/news/sam-altman-fired-from-openai-timeline']},\n",
" 'llm': {'replies': ['Sam Altman was dismissed as CEO of OpenAI on Friday, November 17.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 350,\n",
" 'completion_tokens': 17,\n",
" 'total_tokens': 367}}]}}"
]
},
"metadata": {},
"execution_count": 70
}
]
},
{
"cell_type": "markdown",
"source": [
"# Create a custom component\n",
"\n",
"If you want to retrieve data from something else entirely, you can create a custom component.\n",
"\n",
"For example, this is a topic where we may want to retrieve some links from Hacker News instead of Google. While this is also possible with the SerperDev API, let's try to make a simple custom component to get links from Hacker News directly.\n",
"\n",
"[Build a custom `HackerNewsFetcher`](https://haystack.deepset.ai/blog/customizing-rag-to-summarize-hacker-news-posts-with-haystack2?utm_campaign=developer-relations&utm_source=data-hour-event&utm_medium=webinar)"
],
"metadata": {
"id": "-fYMyBBm8y3D"
}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment