Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Last active November 21, 2023 15:59
Show Gist options
  • Save ZanSara/cad6f772d3a894058db34f566e2c4042 to your computer and use it in GitHub Desktop.
Save ZanSara/cad6f772d3a894058db34f566e2c4042 to your computer and use it in GitHub Desktop.
RAG Pipelines from scratch (0.88.0).ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [
"Ubo3JEG-UnNf",
"kF6LrxvHVp9b",
"zfsbF9aqVtfX",
"5ELhDczXWYPq",
"0aRCL7YyX70Q"
],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"03e7c0507eab4c0a8e927a4bd1e187b6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_815b28e89dec4e3581b07624cb3dfb1e",
"IPY_MODEL_8ade7c82cb974d488df6e9dfcaee593f",
"IPY_MODEL_fff12707ed6840babed92a200024cc85"
],
"layout": "IPY_MODEL_f3d8b41ade2d426aa826b95762f6a530"
}
},
"815b28e89dec4e3581b07624cb3dfb1e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_32f8deec4e9e457ab7cad7ea80c2dfac",
"placeholder": "​",
"style": "IPY_MODEL_9c484161937c405fba2097f3d6282ab2",
"value": "Ranking by BM25...: 100%"
}
},
"8ade7c82cb974d488df6e9dfcaee593f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_57c9f2d777a444d7b8c99b905c8b4761",
"max": 4,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_8db87770309d40ffa9352bf479b806f8",
"value": 4
}
},
"fff12707ed6840babed92a200024cc85": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_71e5f95d9da041a8b92f562e1e541787",
"placeholder": "​",
"style": "IPY_MODEL_e1178ea235074c1682abd789aaeb4bc6",
"value": " 4/4 [00:00<00:00, 91.69 docs/s]"
}
},
"f3d8b41ade2d426aa826b95762f6a530": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"32f8deec4e9e457ab7cad7ea80c2dfac": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9c484161937c405fba2097f3d6282ab2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"57c9f2d777a444d7b8c99b905c8b4761": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8db87770309d40ffa9352bf479b806f8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"71e5f95d9da041a8b92f562e1e541787": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e1178ea235074c1682abd789aaeb4bc6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"bb8b5e2527fd4487916c0ed147e48a56": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_b26fcf8421734fdcb8c95d3380af7796",
"IPY_MODEL_24a5f821ec7640b78cf05c901022e82d",
"IPY_MODEL_a35b5290a5a245ceb55d3d5c11240126"
],
"layout": "IPY_MODEL_d5a0e9767ccc4fbfa10b416b4798ed57"
}
},
"b26fcf8421734fdcb8c95d3380af7796": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_0105b02b9253453c8eede94698f3076d",
"placeholder": "​",
"style": "IPY_MODEL_0c0cc86ef4834deb8a23650545a555d3",
"value": "Ranking by BM25...: 100%"
}
},
"24a5f821ec7640b78cf05c901022e82d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3b0744f1401a468ba192dfb645186c77",
"max": 4,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_e0821f777f7349a185f6550825558717",
"value": 4
}
},
"a35b5290a5a245ceb55d3d5c11240126": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9342fd9bcb5e44459fd8671ff57a89f0",
"placeholder": "​",
"style": "IPY_MODEL_4b2d278f9c4c49d896c8153fd5d917b2",
"value": " 4/4 [00:00<00:00, 96.82 docs/s]"
}
},
"d5a0e9767ccc4fbfa10b416b4798ed57": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0105b02b9253453c8eede94698f3076d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0c0cc86ef4834deb8a23650545a555d3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3b0744f1401a468ba192dfb645186c77": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e0821f777f7349a185f6550825558717": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"9342fd9bcb5e44459fd8671ff57a89f0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4b2d278f9c4c49d896c8153fd5d917b2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3f60a91b4d2a44199e401de42889b087": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a7a755ed9f354de1be2f14043b32442b",
"IPY_MODEL_9b4caaa3363d4cedbfb32adc2e9afc04",
"IPY_MODEL_5283b11e674a48c884065b2a287bc1f8"
],
"layout": "IPY_MODEL_511a7dd815d44d22aee2107d5a4e61c5"
}
},
"a7a755ed9f354de1be2f14043b32442b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_dc3fdf05b5e74262813374b63bc275c1",
"placeholder": "​",
"style": "IPY_MODEL_e2b97c61feed40ec911bd9a1c85ea59f",
"value": "Ranking by BM25...: 100%"
}
},
"9b4caaa3363d4cedbfb32adc2e9afc04": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6a7e7e96e41e43f4a727d8545bd31f87",
"max": 4,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_5577103f8f0f4614822111d23cf7fe4a",
"value": 4
}
},
"5283b11e674a48c884065b2a287bc1f8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_fc7a22b3e46948e8a0398ae91607605b",
"placeholder": "​",
"style": "IPY_MODEL_c2dc0799f5b44601a3d290d0284785fb",
"value": " 4/4 [00:00<00:00, 134.93 docs/s]"
}
},
"511a7dd815d44d22aee2107d5a4e61c5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"dc3fdf05b5e74262813374b63bc275c1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e2b97c61feed40ec911bd9a1c85ea59f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6a7e7e96e41e43f4a727d8545bd31f87": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5577103f8f0f4614822111d23cf7fe4a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"fc7a22b3e46948e8a0398ae91607605b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c2dc0799f5b44601a3d290d0284785fb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ZanSara/cad6f772d3a894058db34f566e2c4042/rag_pipelines_from_scratch_to_production.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Install the libraries"
],
"metadata": {
"id": "n1DbBwkqX-ax"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gixkExYMh9cA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f5d5324f-18ec-47c7-cc72-661cb9b78083"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting posthog\n",
" Downloading posthog-3.0.2-py2.py3-none-any.whl (37 kB)\n",
"Requirement already satisfied: requests<3.0,>=2.7 in /usr/local/lib/python3.10/dist-packages (from posthog) (2.31.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog) (1.16.0)\n",
"Collecting monotonic>=1.5 (from posthog)\n",
" Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
"Collecting backoff>=1.10.0 (from posthog)\n",
" Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: python-dateutil>2.1 in /usr/local/lib/python3.10/dist-packages (from posthog) (2.8.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog) (3.3.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog) (2023.7.22)\n",
"Installing collected packages: monotonic, backoff, posthog\n",
"Successfully installed backoff-2.2.1 monotonic-1.6 posthog-3.0.2\n",
"Collecting langdetect\n",
" Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect) (1.16.0)\n",
"Building wheels for collected packages: langdetect\n",
" Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=111d9f677a0746fe63d9b8de840ea29dbc47dfd5b6862954ba71d545a8c8065c\n",
" Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n",
"Successfully built langdetect\n",
"Installing collected packages: langdetect\n",
"Successfully installed langdetect-1.0.9\n",
"Collecting boilerpy3\n",
" Downloading boilerpy3-1.0.6-py3-none-any.whl (22 kB)\n",
"Installing collected packages: boilerpy3\n",
"Successfully installed boilerpy3-1.0.6\n",
"Collecting transformers[sentencepiece,torch]==4.34.1\n",
" Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m50.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (3.12.4)\n",
"Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m32.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (23.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (6.0.1)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (2023.6.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (2.31.0)\n",
"Collecting tokenizers<0.15,>=0.14 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m79.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting safetensors>=0.3.1 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m79.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (4.66.1)\n",
"Requirement already satisfied: torch!=1.12.0,>=1.10 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (2.1.0+cu118)\n",
"Collecting accelerate>=0.20.3 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.0/261.0 kB\u001b[0m \u001b[31m29.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m58.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece,torch]==4.34.1) (3.20.3)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.20.3->transformers[sentencepiece,torch]==4.34.1) (5.9.5)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers[sentencepiece,torch]==4.34.1) (2023.6.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers[sentencepiece,torch]==4.34.1) (4.5.0)\n",
"Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[sentencepiece,torch]==4.34.1)\n",
" Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (3.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (3.1.2)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (2.1.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[sentencepiece,torch]==4.34.1) (3.3.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[sentencepiece,torch]==4.34.1) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[sentencepiece,torch]==4.34.1) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[sentencepiece,torch]==4.34.1) (2023.7.22)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (2.1.3)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch!=1.12.0,>=1.10->transformers[sentencepiece,torch]==4.34.1) (1.3.0)\n",
"Installing collected packages: sentencepiece, safetensors, huggingface-hub, tokenizers, accelerate, transformers\n",
"Successfully installed accelerate-0.24.0 huggingface-hub-0.17.3 safetensors-0.4.0 sentencepiece-0.1.99 tokenizers-0.14.1 transformers-4.34.1\n",
"Collecting haystack-ai==0.88.0\n",
" Downloading haystack_ai-0.88.0-py3-none-any.whl (98 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.5/98.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting canals==0.8.1 (from haystack-ai==0.88.0)\n",
" Downloading canals-0.8.1-py3-none-any.whl (32 kB)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (3.1.2)\n",
"Collecting lazy-imports (from haystack-ai==0.88.0)\n",
" Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (10.1.0)\n",
"Collecting openai (from haystack-ai==0.88.0)\n",
" Downloading openai-0.28.1-py3-none-any.whl (76 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (1.5.3)\n",
"Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (3.0.2)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (6.0.1)\n",
"Collecting rank-bm25 (from haystack-ai==0.88.0)\n",
" Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (2.31.0)\n",
"Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (8.2.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai==0.88.0) (4.66.1)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from canals==0.8.1->haystack-ai==0.88.0) (3.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai==0.88.0) (2.1.3)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai->haystack-ai==0.88.0) (3.8.6)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==0.88.0) (3.3.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==0.88.0) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==0.88.0) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==0.88.0) (2023.7.22)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai==0.88.0) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai==0.88.0) (2023.3.post1)\n",
"Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai==0.88.0) (1.23.5)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai==0.88.0) (1.16.0)\n",
"Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai==0.88.0) (1.6)\n",
"Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai==0.88.0) (2.2.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai==0.88.0) (1.3.1)\n",
"Installing collected packages: rank-bm25, lazy-imports, canals, openai, haystack-ai\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"llmx 0.0.15a0 requires cohere, which is not installed.\n",
"llmx 0.0.15a0 requires tiktoken, which is not installed.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed canals-0.8.1 haystack-ai-0.88.0 lazy-imports-0.3.1 openai-0.28.1 rank-bm25-0.2.2\n"
]
}
],
"source": [
"# Install haystack & some deps\n",
"%pip install posthog\n",
"%pip install langdetect\n",
"%pip install boilerpy3\n",
"%pip install transformers[torch,sentencepiece]==4.34.1\n",
"%pip install haystack-ai==0.88.0"
]
},
{
"cell_type": "code",
"source": [
"# Get OpenAI API key\n",
"\n",
"import getpass\n",
"\n",
"api_key = getpass.getpass()"
],
"metadata": {
"id": "wzdNlKb2To-z",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "85ed8154-eec2-48b7-b9e5-feb042382c4b"
},
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"··········\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Generators"
],
"metadata": {
"id": "Ubo3JEG-UnNf"
}
},
{
"cell_type": "code",
"source": [
"from haystack.preview.components.generators.openai.gpt import GPTGenerator\n",
"\n",
"generator = GPTGenerator(api_key=api_key)\n",
"\n",
"generator.run(prompt=\"What's the official language of France?\")"
],
"metadata": {
"id": "N-4JJYYai-i9",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "27d13641-5425-4fcd-bab5-81d9df532790"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'replies': ['The official language of France is French.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 15,\n",
" 'completion_tokens': 8,\n",
" 'total_tokens': 23}}]}"
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "markdown",
"source": [
"# PromptBuilder"
],
"metadata": {
"id": "kF6LrxvHVp9b"
}
},
{
"cell_type": "code",
"source": [
"from haystack.preview.components.builders.prompt_builder import PromptBuilder\n",
"\n",
"prompt_builder = PromptBuilder(template=\"What's the official language of {{ country }}?\")\n",
"\n",
"prompt_builder.run(country=\"France\")"
],
"metadata": {
"id": "2u_50rAUjpod",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2c70639b-638d-40c7-c129-266b5ea9974d"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'prompt': \"What's the official language of France?\"}"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"source": [
"# A Simple Generative Pipeline"
],
"metadata": {
"id": "zfsbF9aqVtfX"
}
},
{
"cell_type": "code",
"source": [
"from haystack.preview import Pipeline\n",
"from haystack.preview.components.generators.openai.gpt import GPTGenerator\n",
"from haystack.preview.components.builders.prompt_builder import PromptBuilder\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=\"What's the official language of {{ country }}?\"))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"pipe.run({\"prompt_builder\": {\"country\": \"France\"}})"
],
"metadata": {
"id": "14qTTokskoWJ",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "02874048-2df4-4c20-9457-2888bd9867cb"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of France is French.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 15,\n",
" 'completion_tokens': 8,\n",
" 'total_tokens': 23}}]}}"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"simple-llm-pipeline.png\")"
],
"metadata": {
"id": "NWDO5EUgVGhN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f2faf77a-8c21-4241-e63b-27c0fe408966"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'prompt_builder': \"country: InputSocket(name='country', type=typing.Any, is_optional=False, sender=None)\", 'llm': \"prompt: InputSocket(name='prompt', type=<class 'str'>, is_optional=False, sender='prompt_builder')\"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Make the LLM cheat"
],
"metadata": {
"id": "5ELhDczXWYPq"
}
},
{
"cell_type": "code",
"source": [
"pipe.run({\"prompt_builder\": {\"country\": \"the Republic of Rose Island\"}})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oIEB_HfHWIFC",
"outputId": "e22f0d4d-ee5a-4c41-b631-ed1688873ace"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of the Republic of Rose Island was Italian.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 19,\n",
" 'completion_tokens': 12,\n",
" 'total_tokens': 31}}]}}"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"context_template = \"\"\"\n",
"Given the following information, answer the question.\n",
"Context: {{ context }}\n",
"Question: {{ question }}\n",
"\"\"\"\n",
"language_template = \"What's the official language of {{ country }}?\"\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"context_prompt\", PromptBuilder(template=context_template))\n",
"pipe.add_component(\"language_prompt\", PromptBuilder(template=language_template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"language_prompt\", \"context_prompt.question\")\n",
"pipe.connect(\"context_prompt\", \"llm\")\n",
"\n",
"pipe.run({\n",
" \"context_prompt\": {\"context\": \"Rose Island had its own government, currency, post office, and commercial establishments, and the official language was Esperanto.\"},\n",
" \"language_prompt\": {\"country\": \"the Republic of Rose Island\"}\n",
"})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6UaQ8n03s2rO",
"outputId": "641c203d-9cf1-414b-ed0a-855303e76fe8"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of the Republic of Rose Island is Esperanto.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 57,\n",
" 'completion_tokens': 13,\n",
" 'total_tokens': 70}}]}}"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"double-prompt-builder-pipeline.png\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ue-_-skjWh-Q",
"outputId": "478257e8-7335-48a1-cf16-c8f58659c2b2"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'context_prompt': \"context: InputSocket(name='context', type=typing.Any, is_optional=False, sender=None)\\nquestion: InputSocket(name='question', type=typing.Any, is_optional=False, sender='language_prompt')\", 'language_prompt': \"country: InputSocket(name='country', type=typing.Any, is_optional=False, sender=None)\", 'llm': \"prompt: InputSocket(name='prompt', type=<class 'str'>, is_optional=False, sender='context_prompt')\"}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"template = \"\"\"\n",
"Given the following information, answer the question.\n",
"Context: {{ context }}\n",
"Question: What's the official language of {{ country }}?\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"pipe.run({\n",
" \"prompt_builder\": {\n",
" \"context\": \"Rose Island had its own government, currency, post office, and commercial establishments, and the official language was Esperanto.\",\n",
" \"country\": \"the Republic of Rose Island\"\n",
" }\n",
"})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Gc2w5MD1WzzB",
"outputId": "4652853c-326c-4318-9c57-fe3e60fa88c7"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of the Republic of Rose Island is Esperanto.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 57,\n",
" 'completion_tokens': 13,\n",
" 'total_tokens': 70}}]}}"
]
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"advanced-prompt-builder-pipeline.png\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-simX98cW8SL",
"outputId": "b555cb39-64aa-405a-fd3c-d1df3eb5af9a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'prompt_builder': \"context: InputSocket(name='context', type=typing.Any, is_optional=False, sender=None)\\ncountry: InputSocket(name='country', type=typing.Any, is_optional=False, sender=None)\", 'llm': \"prompt: InputSocket(name='prompt', type=<class 'str'>, is_optional=False, sender='prompt_builder')\"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Retrieving the context"
],
"metadata": {
"id": "3V5sCS1bUqzS"
}
},
{
"cell_type": "code",
"source": [
"from haystack.preview.dataclasses import Document\n",
"from haystack.preview.document_stores import InMemoryDocumentStore\n",
"\n",
"documents = [\n",
" Document(text=\"German is the the official language of Germany.\"),\n",
" Document(text=\"The capital of France is Paris, and its official language is French.\"),\n",
" Document(text=\"Italy recognizes a few official languages, but the most widespread one is Italian.\"),\n",
" Document(text=\"Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.\")\n",
"]\n",
"docstore = InMemoryDocumentStore()\n",
"docstore.write_documents(documents=documents)\n",
"\n",
"docstore.filter_documents()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "j0s5v7VCXhFo",
"outputId": "394fc538-bd6e-468d-f440-845118cefaae"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[Document(id='e2afcf2a41f8075f26cfd0c125327306fcedf7e8633c9f381a9622e0708f9e59', text='German is the the official language of Germany.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=None),\n",
" Document(id='08e67d3889bc91771ba65eeea4df53269a3bb6d0bbe483593b4364ef34867a6b', text='The capital of France is Paris, and its official language is French.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=None),\n",
" Document(id='e13f61547d3505967902abf4e9aeaf2bd4ec853feca6681676f66773493f3cb5', text='Italy recognizes a few official languages, but the most widespread one is Italian.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=None),\n",
" Document(id='cc929738998c7cfc223fefb396c3304906e774c85622ef8861ceb06de9acb02e', text='Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=None)]"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"from haystack.preview.components.retrievers.in_memory_bm25_retriever import InMemoryBM25Retriever\n",
"\n",
"retriever = InMemoryBM25Retriever(document_store=docstore)"
],
"metadata": {
"id": "hxpU-H_gXiNt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"retriever.run(query=\"Rose Island\", top_k=1)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 935,
"referenced_widgets": [
"03e7c0507eab4c0a8e927a4bd1e187b6",
"815b28e89dec4e3581b07624cb3dfb1e",
"8ade7c82cb974d488df6e9dfcaee593f",
"fff12707ed6840babed92a200024cc85",
"f3d8b41ade2d426aa826b95762f6a530",
"32f8deec4e9e457ab7cad7ea80c2dfac",
"9c484161937c405fba2097f3d6282ab2",
"57c9f2d777a444d7b8c99b905c8b4761",
"8db87770309d40ffa9352bf479b806f8",
"71e5f95d9da041a8b92f562e1e541787",
"e1178ea235074c1682abd789aaeb4bc6"
]
},
"id": "J6va4mCuXiIU",
"outputId": "5f2d918c-4f86-402a-b84b-ac0d56eca74a"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/4 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "03e7c0507eab4c0a8e927a4bd1e187b6"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'documents': [Document(id='cc929738998c7cfc223fefb396c3304906e774c85622ef8861ceb06de9acb02e', text='Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=0.537198780872107)]}"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"source": [
"retriever.run(query=\"Rose Island\", top_k=3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"bb8b5e2527fd4487916c0ed147e48a56",
"b26fcf8421734fdcb8c95d3380af7796",
"24a5f821ec7640b78cf05c901022e82d",
"a35b5290a5a245ceb55d3d5c11240126",
"d5a0e9767ccc4fbfa10b416b4798ed57",
"0105b02b9253453c8eede94698f3076d",
"0c0cc86ef4834deb8a23650545a555d3",
"3b0744f1401a468ba192dfb645186c77",
"e0821f777f7349a185f6550825558717",
"9342fd9bcb5e44459fd8671ff57a89f0",
"4b2d278f9c4c49d896c8153fd5d917b2"
]
},
"id": "iWpwPT4JXiCY",
"outputId": "972d6a92-2ff0-467a-8550-871a2be0f2e1"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/4 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "bb8b5e2527fd4487916c0ed147e48a56"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'documents': [Document(id='cc929738998c7cfc223fefb396c3304906e774c85622ef8861ceb06de9acb02e', text='Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=0.537198780872107),\n",
" Document(id='e13f61547d3505967902abf4e9aeaf2bd4ec853feca6681676f66773493f3cb5', text='Italy recognizes a few official languages, but the most widespread one is Italian.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=0.5),\n",
" Document(id='08e67d3889bc91771ba65eeea4df53269a3bb6d0bbe483593b4364ef34867a6b', text='The capital of France is Paris, and its official language is French.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=0.5)]}"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"source": [
"# Our first RAG Pipeline"
],
"metadata": {
"id": "0aRCL7YyX70Q"
}
},
{
"cell_type": "code",
"source": [
"template = \"\"\"\n",
"Given the following information, answer the question.\n",
"\n",
"Context:\n",
"{% for document in documents %}\n",
" {{ document.text }}\n",
"{% endfor %}\n",
"\n",
"Question: What's the official language of {{ country }}?\n",
"\"\"\"\n",
"pipe = Pipeline()\n",
"\n",
"pipe.add_component(\"retriever\", InMemoryBM25Retriever(document_store=docstore))\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"country = \"the Republic of Rose Island\"\n",
"pipe.run({\n",
" \"retriever\": {\"query\": country},\n",
" \"prompt_builder\": {\"country\": country}\n",
"})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 761,
"referenced_widgets": [
"3f60a91b4d2a44199e401de42889b087",
"a7a755ed9f354de1be2f14043b32442b",
"9b4caaa3363d4cedbfb32adc2e9afc04",
"5283b11e674a48c884065b2a287bc1f8",
"511a7dd815d44d22aee2107d5a4e61c5",
"dc3fdf05b5e74262813374b63bc275c1",
"e2b97c61feed40ec911bd9a1c85ea59f",
"6a7e7e96e41e43f4a727d8545bd31f87",
"5577103f8f0f4614822111d23cf7fe4a",
"fc7a22b3e46948e8a0398ae91607605b",
"c2dc0799f5b44601a3d290d0284785fb"
]
},
"id": "jFmhoTfNXh7e",
"outputId": "52c18d4a-db0f-489f-e7c8-ef20133b2446"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Ranking by BM25...: 0%| | 0/4 [00:00<?, ? docs/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "3f60a91b4d2a44199e401de42889b087"
}
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of the Republic of Rose Island is Esperanto.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 115,\n",
" 'completion_tokens': 13,\n",
" 'total_tokens': 128}}]}}"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"source": [
"pipe.draw(\"simple-rag-pipeline.png\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XkS0ozuZXhfV",
"outputId": "db0fd7b8-28ca-4ff0-931d-a908de8ab6ea"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'retriever': \"query: InputSocket(name='query', type=<class 'str'>, is_optional=False, sender=None)\\nfilters: InputSocket(name='filters', type=typing.Optional[typing.Dict[str, typing.Any]], is_optional=True, sender=None)\\ntop_k: InputSocket(name='top_k', type=typing.Optional[int], is_optional=True, sender=None)\\nscale_score: InputSocket(name='scale_score', type=typing.Optional[bool], is_optional=True, sender=None)\", 'prompt_builder': \"country: InputSocket(name='country', type=typing.Any, is_optional=False, sender=None)\\ndocuments: InputSocket(name='documents', type=typing.Any, is_optional=False, sender='retriever')\", 'llm': \"prompt: InputSocket(name='prompt', type=<class 'str'>, is_optional=False, sender='prompt_builder')\"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Going to production"
],
"metadata": {
"id": "iGg33w2pmHr6"
}
},
{
"cell_type": "markdown",
"source": [
"## Setting up Elasticsearch 8 (MANUAL STEPS REQUIRED)"
],
"metadata": {
"id": "nHF3oUcUznyn"
}
},
{
"cell_type": "code",
"source": [
"%%bash\n",
"\n",
"rm -rf elasticsearch*\n",
"wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.8.0-linux-x86_64.tar.gz\n",
"tar -xzf elasticsearch-8.8.0-linux-x86_64.tar.gz\n",
"sudo chown -R daemon:daemon elasticsearch-8.8.0/\n",
"umount /sys/fs/cgroup\n",
"apt install cgroup-tools"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S6EuX-CDnBZh",
"outputId": "4e27bfce-76f3-45e5-9cfd-6ed460d621a5"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists...\n",
"Building dependency tree...\n",
"Reading state information...\n",
"cgroup-tools is already the newest version (2.0-2).\n",
"0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"umount: /sys/fs/cgroup: not mounted.\n",
"\n",
"WARNING: apt does not have a stable CLI interface. Use with caution in scripts.\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%%bash --bg\n",
"\n",
"sudo -H -u daemon elasticsearch-8.8.0/bin/elasticsearch"
],
"metadata": {
"id": "zefnKPrawP0S"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# This wait is important: the Elasticsearch instance needs some time to start up\n",
"import os\n",
"import time\n",
"time.sleep(60)"
],
"metadata": {
"id": "xaAAUNrywRgx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ps -ef | grep elastic"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nOOr_HbFwSoa",
"outputId": "231d5952-180e-454b-e4c1-07505af761ec"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"root 4580 4578 0 16:23 ? 00:00:00 sudo -H -u daemon elasticsearch-8.8.0/bin/elasti\n",
"daemon 4581 4580 23 16:23 ? 00:00:14 /content/elasticsearch-8.8.0/jdk/bin/java -Xms4m\n",
"daemon 4677 4581 99 16:23 ? 00:00:59 /content/elasticsearch-8.8.0/jdk/bin/java -Des.n\n",
"daemon 4721 4677 0 16:24 ? 00:00:00 /content/elasticsearch-8.8.0/modules/x-pack-ml/p\n",
"root 4941 1613 0 16:24 ? 00:00:00 /bin/bash -c ps -ef | grep elastic\n",
"root 4943 4941 0 16:24 ? 00:00:00 grep elastic\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Type 'y' when requested\n",
"\n",
"!/content/elasticsearch-8.8.0/bin/elasticsearch-setup-passwords auto -url \"https://localhost:9200\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1JmlziR8wWYD",
"outputId": "a4095f5d-0af5-45fa-d4f4-239518025fcc"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"******************************************************************************\n",
"Note: The 'elasticsearch-setup-passwords' tool has been deprecated. This command will be removed in a future release.\n",
"******************************************************************************\n",
"\n",
"Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.\n",
"The passwords will be randomly generated and printed to the console.\n",
"Please confirm that you would like to continue [y/N]y\n",
"\n",
"\n",
"Changed password for user apm_system\n",
"PASSWORD apm_system = dh1afT35I7HbiiOsR2ba\n",
"\n",
"Changed password for user kibana_system\n",
"PASSWORD kibana_system = t6Xk8VOyjxrZ2NdspoFX\n",
"\n",
"Changed password for user kibana\n",
"PASSWORD kibana = t6Xk8VOyjxrZ2NdspoFX\n",
"\n",
"Changed password for user logstash_system\n",
"PASSWORD logstash_system = sxU9jqgAzexwJeqDbEHc\n",
"\n",
"Changed password for user beats_system\n",
"PASSWORD beats_system = 6OeRkxptrsVqKuwYtYPU\n",
"\n",
"Changed password for user remote_monitoring_user\n",
"PASSWORD remote_monitoring_user = CwoIxBqhtZPmKM5W13P3\n",
"\n",
"Changed password for user elastic\n",
"PASSWORD elastic = jSWt3HNS0ybYsi3PoDd4\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Copy the password of the 'elastic' user from the bottom of the output above and export it as an env var. It looks like this:\n",
"#\n",
"# Changed password for user elastic\n",
"# PASSWORD elastic = zkjH4RYRZbUjJk9xUYEV\n",
"#\n",
"# You need this password in the cell below as well.\n",
"\n",
"os.environ[\"ELASTICSEARCH_PASSWORD\"] = \"jSWt3HNS0ybYsi3PoDd4\""
],
"metadata": {
"id": "QzLZGbKhwjWk"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Test the password: paste it at the prompt when requested and check that the request succeeds\n",
"!curl --cacert /content/elasticsearch-8.8.0/config/certs/http_ca.crt -u elastic -H 'Content-Type: application/json' -XGET https://localhost:9200/?pretty=true"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fb_byjShwfR1",
"outputId": "fb489fd5-e05b-43d8-c1d1-89b42fd3729c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Enter host password for user 'elastic':\n",
"{\n",
" \"name\" : \"97ad19bfcffb\",\n",
" \"cluster_name\" : \"elasticsearch\",\n",
" \"cluster_uuid\" : \"gMGCyghxTNSQkbCx4EYXaQ\",\n",
" \"version\" : {\n",
" \"number\" : \"8.8.0\",\n",
" \"build_flavor\" : \"default\",\n",
" \"build_type\" : \"tar\",\n",
" \"build_hash\" : \"c01029875a091076ed42cdb3a41c10b1a9a5a20f\",\n",
" \"build_date\" : \"2023-05-23T17:16:07.179039820Z\",\n",
" \"build_snapshot\" : false,\n",
" \"lucene_version\" : \"9.6.0\",\n",
" \"minimum_wire_compatibility_version\" : \"7.17.0\",\n",
" \"minimum_index_compatibility_version\" : \"7.0.0\"\n",
" },\n",
" \"tagline\" : \"You Know, for Search\"\n",
"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Install the Haystack Elasticsearch integration"
],
"metadata": {
"id": "MgGM2qKXzvUA"
}
},
{
"cell_type": "code",
"source": [
"%pip install \"git+https://github.com/deepset-ai/haystack-core-integrations.git@a052341#egg=elasticsearch-haystack&subdirectory=document_stores/elasticsearch\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "xNm-mSN8mPFd",
"outputId": "e1901984-7c42-4896-94d9-ea0bc49d568d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting elasticsearch-haystack\n",
" Cloning https://github.com/deepset-ai/haystack-core-integrations.git (to revision a052341) to /tmp/pip-install-zmcx3d3y/elasticsearch-haystack_e7db19db19e9439399309d044144a338\n",
" Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack-core-integrations.git /tmp/pip-install-zmcx3d3y/elasticsearch-haystack_e7db19db19e9439399309d044144a338\n",
"\u001b[33m WARNING: Did not find branch or tag 'a052341', assuming revision or ref.\u001b[0m\u001b[33m\n",
"\u001b[0m Running command git checkout -q a052341\n",
" Resolved https://github.com/deepset-ai/haystack-core-integrations.git to commit a052341\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting elasticsearch<9,>=8 (from elasticsearch-haystack)\n",
" Downloading elasticsearch-8.10.1-py3-none-any.whl (409 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m409.3/409.3 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: haystack-ai in /usr/local/lib/python3.10/dist-packages (from elasticsearch-haystack) (0.88.0)\n",
"Collecting elastic-transport<9,>=8 (from elasticsearch<9,>=8->elasticsearch-haystack)\n",
" Downloading elastic_transport-8.4.1-py3-none-any.whl (59 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: canals==0.8.1 in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (0.8.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (3.1.2)\n",
"Requirement already satisfied: lazy-imports in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (0.3.1)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (10.1.0)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (0.28.1)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (1.5.3)\n",
"Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (3.0.2)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (6.0.1)\n",
"Requirement already satisfied: rank-bm25 in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (0.2.2)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (2.31.0)\n",
"Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (8.2.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai->elasticsearch-haystack) (4.66.1)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from canals==0.8.1->haystack-ai->elasticsearch-haystack) (3.2)\n",
"Collecting urllib3<2,>=1.26.2 (from elastic-transport<9,>=8->elasticsearch<9,>=8->elasticsearch-haystack)\n",
" Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch<9,>=8->elasticsearch-haystack) (2023.7.22)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai->elasticsearch-haystack) (2.1.3)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai->haystack-ai->elasticsearch-haystack) (3.8.6)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai->elasticsearch-haystack) (3.3.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai->elasticsearch-haystack) (3.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai->elasticsearch-haystack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai->elasticsearch-haystack) (2023.3.post1)\n",
"Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai->elasticsearch-haystack) (1.23.5)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai->elasticsearch-haystack) (1.16.0)\n",
"Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai->elasticsearch-haystack) (1.6)\n",
"Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai->elasticsearch-haystack) (2.2.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->haystack-ai->elasticsearch-haystack) (1.3.1)\n",
"Building wheels for collected packages: elasticsearch-haystack\n",
" Building wheel for elasticsearch-haystack (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for elasticsearch-haystack: filename=elasticsearch_haystack-0.0.1-py3-none-any.whl size=13070 sha256=1d9195a2b19e7e10bfa820f48bd05bf7d8b8c52cea15016e7d1610b893683490\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-96fel4z0/wheels/4c/ac/c7/efe7ab14a97ebf78312666b6273ee9509cd9bc8d59bfb9e142\n",
"Successfully built elasticsearch-haystack\n",
"Installing collected packages: urllib3, elastic-transport, elasticsearch, elasticsearch-haystack\n",
" Attempting uninstall: urllib3\n",
" Found existing installation: urllib3 2.0.7\n",
" Uninstalling urllib3-2.0.7:\n",
" Successfully uninstalled urllib3-2.0.7\n",
"Successfully installed elastic-transport-8.4.1 elasticsearch-8.10.1 elasticsearch-haystack-0.0.1 urllib3-1.26.18\n"
]
},
{
"output_type": "display_data",
"data": {
"application/vnd.colab-display-data+json": {
"pip_warning": {
"packages": [
"urllib3"
]
}
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## Build the pipeline"
],
"metadata": {
"id": "hTTmopFX-jse"
}
},
{
"cell_type": "code",
"source": [
"from elasticsearch_haystack.document_store import ElasticsearchDocumentStore\n",
"\n",
"# Get the host where Elasticsearch is running, default to localhost\n",
"host = os.environ.get(\"ELASTICSEARCH_HOST\", \"https://localhost:9200\")\n",
"user = \"elastic\"\n",
"pwd = os.environ[\"ELASTICSEARCH_PASSWORD\"] # If this fails with a KeyError, ELASTICSEARCH_PASSWORD is not set: make sure you uncommented and ran the relevant lines during ES setup.\n",
"\n",
"docstore = ElasticsearchDocumentStore(hosts=[host], basic_auth=(user, pwd), ca_certs=\"/content/elasticsearch-8.8.0/config/certs/http_ca.crt\")"
],
"metadata": {
"id": "QRVh5wAHnxJz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from haystack.preview.document_stores import DuplicatePolicy\n",
"documents = [\n",
" Document(text=\"German is the the official language of Germany.\"),\n",
" Document(text=\"The capital of France is Paris, and its official language is French.\"),\n",
" Document(text=\"Italy recognizes a few official languages, but the most widespread one is Italian.\"),\n",
" Document(text=\"Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.\")\n",
"]\n",
"docstore.write_documents(documents=documents, policy=DuplicatePolicy.OVERWRITE)\n",
"\n",
"docstore.filter_documents()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lAlQ0wiL1RfN",
"outputId": "bc12f7b9-8b1b-4822-9e87-975218ebcffa"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[Document(id='e2afcf2a41f8075f26cfd0c125327306fcedf7e8633c9f381a9622e0708f9e59', text='German is the the official language of Germany.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=1.0),\n",
" Document(id='08e67d3889bc91771ba65eeea4df53269a3bb6d0bbe483593b4364ef34867a6b', text='The capital of France is Paris, and its official language is French.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=1.0),\n",
" Document(id='e13f61547d3505967902abf4e9aeaf2bd4ec853feca6681676f66773493f3cb5', text='Italy recognizes a few official languages, but the most widespread one is Italian.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=1.0),\n",
" Document(id='cc929738998c7cfc223fefb396c3304906e774c85622ef8861ceb06de9acb02e', text='Esperanto has been adopted as official language for some microstates as well, such as the Republic of Rose Island, a short-lived microstate built on a sea platform in the Adriatic Sea.', array=None, dataframe=None, blob=None, mime_type='text/plain', metadata={}, id_hash_keys=['text', 'array', 'dataframe', 'blob'], score=1.0)]"
]
},
"metadata": {},
"execution_count": 45
}
]
},
{
"cell_type": "code",
"source": [
"from elasticsearch_haystack.bm25_retriever import ElasticsearchBM25Retriever\n",
"\n",
"template = \"\"\"\n",
"Given the following information, answer the question.\n",
"\n",
"Context:\n",
"{% for document in documents %}\n",
" {{ document.text }}\n",
"{% endfor %}\n",
"\n",
"Question: What's the official language of {{ country }}?\n",
"\"\"\"\n",
"\n",
"pipe = Pipeline()\n",
"pipe.add_component(\"retriever\", ElasticsearchBM25Retriever(document_store=docstore))\n",
"pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
"pipe.add_component(\"llm\", GPTGenerator(api_key=api_key))\n",
"pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
"pipe.connect(\"prompt_builder\", \"llm\")\n",
"\n",
"pipe.draw(\"elasticsearch-rag-pipeline.png\")\n",
"\n",
"country = \"the Republic of Rose Island\"\n",
"pipe.run({\n",
" \"retriever\": {\"query\": country},\n",
" \"prompt_builder\": {\"country\": country}\n",
"})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rZhA_rK51EoJ",
"outputId": "b25121f3-35c0-409e-91f6-b6bc99468853"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'retriever': \"query: InputSocket(name='query', type=<class 'str'>, is_optional=False, sender=None)\", 'prompt_builder': \"country: InputSocket(name='country', type=typing.Any, is_optional=False, sender=None)\\ndocuments: InputSocket(name='documents', type=typing.Any, is_optional=False, sender='retriever')\", 'llm': \"prompt: InputSocket(name='prompt', type=<class 'str'>, is_optional=False, sender='prompt_builder')\"}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'llm': {'replies': ['The official language of the Republic of Rose Island is Esperanto.'],\n",
" 'metadata': [{'model': 'gpt-3.5-turbo-0613',\n",
" 'index': 0,\n",
" 'finish_reason': 'stop',\n",
" 'usage': {'prompt_tokens': 74,\n",
" 'completion_tokens': 13,\n",
" 'total_tokens': 87}}]}}"
]
},
"metadata": {},
"execution_count": 46
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment