Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created October 14, 2021 17:19
Show Gist options
  • Save pszemraj/92ae15c4b8c66c003d652b10c4a0b2aa to your computer and use it in GitHub Desktop.
Save pszemraj/92ae15c4b8c66c003d652b10c4a0b2aa to your computer and use it in GitHub Desktop.
DM1-2021hs-info-extraction [v2].ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "DM1-2021hs-info-extraction [v2].ipynb",
"provenance": [],
"collapsed_sections": [
"NK7daXR_cAU1",
"d3yqOW7zbKHJ",
"e46AMZxiVIWr",
"X4Y4XecAvjhF",
"Tib_vY_9vpYz",
"iZSEr6edwHf7",
"gnkGwlBGazgG"
],
"toc_visible": true,
"machine_shape": "hm",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"8b4925f18b6b4dd3a78b36831bdfd794": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_b1b4d0313c6b4050b265233ca11f85ae",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_7ee55c7f902e4d36934566d2674b44c7",
"IPY_MODEL_50f2ca2a585b42d8804c87178caa1a3c",
"IPY_MODEL_c8d01e6e25384ce2aa88efa410713458"
]
}
},
"b1b4d0313c6b4050b265233ca11f85ae": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7ee55c7f902e4d36934566d2674b44c7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_a9c43c6632f34a1ea583f8ba287abffb",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d53bb00fa62a45ff8fc944d2e8fe9f8e"
}
},
"50f2ca2a585b42d8804c87178caa1a3c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_11e5674ca6664ea79fc77c535a95b28c",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1143,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1143,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5a1040d9b7334915aba96a9ee5601604"
}
},
"c8d01e6e25384ce2aa88efa410713458": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_8edef780b03e42a39233b29f9d1b5f84",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.14k/1.14k [00:00<00:00, 45.4kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4dc52a2662ff4dbda5f2aea33bcbf059"
}
},
"a9c43c6632f34a1ea583f8ba287abffb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d53bb00fa62a45ff8fc944d2e8fe9f8e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"11e5674ca6664ea79fc77c535a95b28c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"5a1040d9b7334915aba96a9ee5601604": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8edef780b03e42a39233b29f9d1b5f84": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"4dc52a2662ff4dbda5f2aea33bcbf059": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f574a7ca3c614eebb4a95005541f4222": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_d9ac1c6647474271abc4d9a36e1291ac",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_f7f8bace323d4d6e82d50d905e220b20",
"IPY_MODEL_4f91e1b0b1a243b2b2412470cebbac95",
"IPY_MODEL_3a450da0c4894902b64fe096669b7430"
]
}
},
"d9ac1c6647474271abc4d9a36e1291ac": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f7f8bace323d4d6e82d50d905e220b20": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_55686fe0a21944cfb3b8c8a3d2a8701b",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_32e9704140064e6eaf4177a7f259e2d4"
}
},
"4f91e1b0b1a243b2b2412470cebbac95": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_7e8a917ce52040fa83dc5c9f55dd47f8",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1912529,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1912529,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4ac96dfb30464045999928f8b8ad84a2"
}
},
"3a450da0c4894902b64fe096669b7430": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_fcf19025287544e59597bd34cd00e621",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.91M/1.91M [00:00<00:00, 2.92MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_326b5113687d4d3b8880fc84392dca42"
}
},
"55686fe0a21944cfb3b8c8a3d2a8701b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"32e9704140064e6eaf4177a7f259e2d4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7e8a917ce52040fa83dc5c9f55dd47f8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"4ac96dfb30464045999928f8b8ad84a2": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"fcf19025287544e59597bd34cd00e621": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"326b5113687d4d3b8880fc84392dca42": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"69fecc3080824a98835affb58b00d52b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_368440c926b34b418208cc5095cdbec8",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_2cdc539883ec402993404c0291168811",
"IPY_MODEL_fd46ef5261a143b7b7f500a3f2aa9698",
"IPY_MODEL_72b2887e929b4fa8b6a7312bf1c88f34"
]
}
},
"368440c926b34b418208cc5095cdbec8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2cdc539883ec402993404c0291168811": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_ca439e33b100451ea2874ab2e43c6812",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_eb842c1fc52c40b4aabcd888a05ca995"
}
},
"fd46ef5261a143b7b7f500a3f2aa9698": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_3b8bb1707e544af0b01d078ab8845af9",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 65,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 65,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9cedd17dc11f44599d8bec369ccd9ae6"
}
},
"72b2887e929b4fa8b6a7312bf1c88f34": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_9579fd27034f4fa2bd3d8ead02a287c4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 65.0/65.0 [00:00<00:00, 2.59kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c36a5d0edb8a4cb89b07a9c7f9a64878"
}
},
"ca439e33b100451ea2874ab2e43c6812": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"eb842c1fc52c40b4aabcd888a05ca995": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3b8bb1707e544af0b01d078ab8845af9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"9cedd17dc11f44599d8bec369ccd9ae6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9579fd27034f4fa2bd3d8ead02a287c4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c36a5d0edb8a4cb89b07a9c7f9a64878": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c61fa952578944b7a6e31b1a85ed9136": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_661d296202464897b8980b715c896445",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_65a71be7d9424158b52ecb8b6c28277a",
"IPY_MODEL_dab18c875b394056abfcd862e1d2fd82",
"IPY_MODEL_ed9814df69bc482692b023164acb0370"
]
}
},
"661d296202464897b8980b715c896445": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"65a71be7d9424158b52ecb8b6c28277a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_4a1a6b54dade477a9319c7ee9fc4b3e4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d3dc35eeda414597b93333b72ea1ada7"
}
},
"dab18c875b394056abfcd862e1d2fd82": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_569cf4c72fda4667a0d782412fa0065f",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 87,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 87,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e2a0d47e4de94d689ce9fb3ea27cbd3b"
}
},
"ed9814df69bc482692b023164acb0370": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_308964c16df440ca85e79170c19ce5be",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 87.0/87.0 [00:00<00:00, 3.48kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_53e23ddcf3e04ead833155c1dcf27e6c"
}
},
"4a1a6b54dade477a9319c7ee9fc4b3e4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d3dc35eeda414597b93333b72ea1ada7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"569cf4c72fda4667a0d782412fa0065f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"e2a0d47e4de94d689ce9fb3ea27cbd3b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"308964c16df440ca85e79170c19ce5be": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"53e23ddcf3e04ead833155c1dcf27e6c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"724812ffa9704cb98bef6dd2cc1832cf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_5f497ad6fcb540db8ceac9b072fe4782",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_ec25e2f20b7b4693a38ab1ce762ee4ea",
"IPY_MODEL_bacf27bfc94d47f8821e8c790c877852",
"IPY_MODEL_0d6bbe7f480e46778a1dacf8a110f041"
]
}
},
"5f497ad6fcb540db8ceac9b072fe4782": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ec25e2f20b7b4693a38ab1ce762ee4ea": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b673b94f774743ef9b18773bb68b4718",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "answering questions...: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c3d626747bb54178a86537b8b6fcf807"
}
},
"bacf27bfc94d47f8821e8c790c877852": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_345a35e102c144188b64ff73fe597e72",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 15,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 15,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5ca8dd47bc4344e8b9251327b914e4d3"
}
},
"0d6bbe7f480e46778a1dacf8a110f041": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_faa7eab29349446f8b8f29022c3af7dc",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 15/15 [03:35<00:00, 14.40s/it]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e5e84104a3a54cfa9be4c21c58cdb3df"
}
},
"b673b94f774743ef9b18773bb68b4718": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c3d626747bb54178a86537b8b6fcf807": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"345a35e102c144188b64ff73fe597e72": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"5ca8dd47bc4344e8b9251327b914e4d3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"faa7eab29349446f8b8f29022c3af7dc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"e5e84104a3a54cfa9be4c21c58cdb3df": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b66e639d33964b909d78307b964b65fd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_a031cd7bce7544d7a2ce54d3bff73cbb",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_72b3bdb9a04c4a9aa9fd81f2d9e6a662",
"IPY_MODEL_dfb0cf603bf94789880919363c8521c9",
"IPY_MODEL_bb90b97c0239499aa2e074b8b23afa90"
]
}
},
"a031cd7bce7544d7a2ce54d3bff73cbb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"72b3bdb9a04c4a9aa9fd81f2d9e6a662": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_634b387cb25d4582ba23976c04a6e8ed",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_70d8fb74b163420b8e1aaaa2368f98cb"
}
},
"dfb0cf603bf94789880919363c8521c9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_68c585b41a9c4554af93c17530561707",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1291,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1291,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_0b3f4c76165a4a2aa5afcdace96b6255"
}
},
"bb90b97c0239499aa2e074b8b23afa90": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_4e01882dfc014bdb89e4ee2150e72306",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.29k/1.29k [00:00<00:00, 52.0kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_bc375c8e893146ffb099077b6f86f1e8"
}
},
"634b387cb25d4582ba23976c04a6e8ed": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"70d8fb74b163420b8e1aaaa2368f98cb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"68c585b41a9c4554af93c17530561707": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"0b3f4c76165a4a2aa5afcdace96b6255": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"4e01882dfc014bdb89e4ee2150e72306": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"bc375c8e893146ffb099077b6f86f1e8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5e3cea0100f44680a98b29ef506c1240": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_949b9cb8562249ad93015da60bcf0af4",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_5b82531520be4057919d8d430a945e87",
"IPY_MODEL_c3ad3251d7964845912e81bed6c4933f",
"IPY_MODEL_a4080cdae7314dbba708ce7a18416ea1"
]
}
},
"949b9cb8562249ad93015da60bcf0af4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5b82531520be4057919d8d430a945e87": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_7d7b45d8466a4c44bf60ce1a161be880",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3488f4a6cc4c4f1997f97fc387ea3427"
}
},
"c3ad3251d7964845912e81bed6c4933f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_f5f274fcb4754de7be311d4c05ddd9eb",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1839633783,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1839633783,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6dc6cb3ba8084721a0557e6b9f299cd2"
}
},
"a4080cdae7314dbba708ce7a18416ea1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_3c53c54187d14ea9b1981c837ea60290",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.84G/1.84G [00:42<00:00, 39.5MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b648b2fa5d0f47a585a2277f209d3b99"
}
},
"7d7b45d8466a4c44bf60ce1a161be880": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3488f4a6cc4c4f1997f97fc387ea3427": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f5f274fcb4754de7be311d4c05ddd9eb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"6dc6cb3ba8084721a0557e6b9f299cd2": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3c53c54187d14ea9b1981c837ea60290": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b648b2fa5d0f47a585a2277f209d3b99": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0c0a7675aad74a748c35333306417836": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_b042e089cd224ccd8ec9ac21b78e5a55",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_a013a190a0794204877a1c4268739da2",
"IPY_MODEL_5c08be3930674a4db40bfc5fa9689b55",
"IPY_MODEL_3a9471cd4cf449768add43beb1116ffc"
]
}
},
"b042e089cd224ccd8ec9ac21b78e5a55": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a013a190a0794204877a1c4268739da2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_baf5f95a331144449b8d956019d456af",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "getting defs for search_terms...: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3fbbac66544b413d8b09a447774a875f"
}
},
"5c08be3930674a4db40bfc5fa9689b55": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_e3e5b903034042459eb19d84d86822c3",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 8,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 8,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_1f949b6cffce4ec9be310bb15bbe662e"
}
},
"3a9471cd4cf449768add43beb1116ffc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_7fff9a716ebf449888de5a08bd4eed66",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 8/8 [00:50<00:00, 6.37s/it]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_61758c7ae2094fec960c5c0533f65366"
}
},
"baf5f95a331144449b8d956019d456af": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3fbbac66544b413d8b09a447774a875f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e3e5b903034042459eb19d84d86822c3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"1f949b6cffce4ec9be310bb15bbe662e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7fff9a716ebf449888de5a08bd4eed66": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"61758c7ae2094fec960c5c0533f65366": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3d810aed1b33438e99487304623a8907": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_bbc0ac56f30548f9b88421e0022142f4",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_54cdfba1ddb64823abf9d670fe8d1781",
"IPY_MODEL_e5901d65bb3445beb1109db43c36bf82",
"IPY_MODEL_eddc90cb8c244d7d97a7ce691641c6f2"
]
}
},
"bbc0ac56f30548f9b88421e0022142f4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"54cdfba1ddb64823abf9d670fe8d1781": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_e1fe58d4071a4f6ea3d7573108e6c039",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "getting defs for search_terms...: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_548828504bac43c18cd2a799ca510336"
}
},
"e5901d65bb3445beb1109db43c36bf82": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_9fdb181e7d3e47b99f4deb868fda0e85",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 26,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 26,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_27fc39e071ed41a1954d0e6c4ebf1e0e"
}
},
"eddc90cb8c244d7d97a7ce691641c6f2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_1db1e2bcdac04ed28c2aa1885b65f5fa",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 26/26 [02:46<00:00, 6.40s/it]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_1d1bc375ff9a4032a3b6a9438ba0b7f4"
}
},
"e1fe58d4071a4f6ea3d7573108e6c039": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"548828504bac43c18cd2a799ca510336": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9fdb181e7d3e47b99f4deb868fda0e85": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"27fc39e071ed41a1954d0e6c4ebf1e0e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"1db1e2bcdac04ed28c2aa1885b65f5fa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"1d1bc375ff9a4032a3b6a9438ba0b7f4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/pszemraj/92ae15c4b8c66c003d652b10c4a0b2aa/dm1-2021hs-info-extraction-v2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bEH-CRbeA6NU"
},
"source": [
"# <center> Analysis of Course Docs using HayStack </center>\n",
"\n",
"- *alternative title: Auto-TA*\n",
"- long form question answering (generative LFQA)\n",
"- <font color=\"goldenrod\">the most relevant sections are at the bottom: **Question-Answer Pipeline & Summarized Document Search**</font>\n",
"- adjust hyperparameters tagged with `decrease_if_crash ` if it crashes\n",
"\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "1UZobc1vJ_Vt"
},
"source": [
"# Colab form field: short course slug; presumably used to label outputs\n",
"# downstream -- verify against later cells\n",
"course_name = \"data-mining\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "uPQ6IBiDeT_i"
},
"source": [
"## Purpose\n",
"\n",
"uses *Haystack* to perform analysis on a corpus of documents all from one class.\n",
"\n",
"#### References\n",
"- A [tutorial](https://towardsdatascience.com/ask-wikipedia-eli5-like-questions-using-long-form-question-answering-on-haystack-32cf1ca6c00e) from medium\n",
"- [haystack github](https://github.com/deepset-ai/haystack)\n",
"- [haystack api ref](https://haystack.deepset.ai/docs/latest/get_startedmd)\n",
"\n",
"### CUDA crash / troubleshooting \n",
"- <font color=\"salmon\"> info retrieval memory usage varies widely depending on the number of documents and associated hyperparameters for the models (there are two) and how the given models interact with each other (somewhat unpredictable)\n",
" - on top of that, the GPU Colab assigns the runtime is pseudo-random and obviously memory depends on the hardware\n",
" - if you run `!nvidia-smi` (see below) and your assigned GPU is not 16 gb, you almost definitely will need to change some things\n",
"- running into a couple CUDA issues while figuring out what works is normal. If it crashes, decrease some key hyperparameters or change the model type used, and try again\n",
"- hyperparameters that are recommended to be adjusted first have been tagged with the comment `decrease_if_crash` in this document. You can search for this tag with the \"find\" functionality (I think the standard shortcut is `CTRL+H` in Colab)\n",
" - typically the very first thing to change should be `n_beams` or `number_beam_search`\n",
"- if I need to take a look at it and the above things have obviously not been investigated.. 😤</font>\n",
"\n",
"\n",
"### adapting this notebook\n",
"\n",
"1. main pre-requisite is that you have all relevant documents as .txt zipped together.\n",
" - convert audio to text using [vid2cleantxt](https://github.com/pszemraj/vid2cleantxt) or other, PDFs/images can be converted via [OCR](https://github.com/JaidedAI/EasyOCR)\n",
"2. the library has not been tested with PDF (by me, at least) but in theory should work - review haystack API docs *linked above* for what they have on PDF\n",
"3. upload your document online somewhere that the .zip can be retrieved by the `requests` library and then update `URL_to_archive`\n",
"4. update questions as needed at the bottom of this notebook\n",
"5. update the models as needed once more comfortable with haystack API (for example, can use text gen trained on ELI5)\n",
"\n",
"\n",
"*by [Peter Szemraj](https://github.com/pszemraj)*\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lqaAS0E-l2h3"
},
"source": [
"## <font color=\"yellow\"> TODO List / Future Work </font>\n",
"\n",
"1. create a new version of this notebook that loads a finished document store item rather than processing all the things\n",
"2. ~~update the question-answer function to save questions+answers as dataframe cols~~\n",
"2. ~~create a custom term-definition function that uses the summarization pipeline and saves to dropbox. should save as a dataframe~~\n",
"3. ~~try `google/t5-large-ssm-nq` and `google/t5-large-ssm-nqo` - [card](https://huggingface.co/google/t5-large-ssm-nqo)~~\n",
"3. ~~try to use `vasudevgupta/bigbird-pegasus-large-bigpatent` for question answering. [model card](vasudevgupta/bigbird-pegasus-large-bigpatent) | [link to colab notebook copy](https://colab.research.google.com/drive/1fR2YHg-UWZ7U8CwF8-hmHsvBr-PseVdW?usp=sharing)~~\n",
"3. Are variants of `EmbeddingRetriever()` with some good text transformer better than `DensePassageRetriever`?\n",
"4. are there other preprocessors besides `haystack.preprocessor.cleaning import clean_wiki_text`\n",
"4. Enable document store cosine similarity with `milvus` document store type *which is needlessly complicated to use on Colab*\n",
"\n",
"### other models to test\n",
"\n",
"- `jpelhaw/t5-word-sense-disambiguation`\n",
"- ~~`castorini/monot5-base-msmarco`~~\n",
"- valhalla [link](https://huggingface.co/valhalla/distilt5-qa-qg-hl-12-6)\n",
"\n",
"---\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NK7daXR_cAU1"
},
"source": [
"## formatting"
]
},
{
"cell_type": "code",
"metadata": {
"id": "XztfEh86cDCR"
},
"source": [
"from IPython.display import HTML, display\n",
"# colab formatting\n",
"def set_css():\n",
"    # Injects CSS so <pre> output wraps instead of scrolling horizontally;\n",
"    # registered below so it runs before every executed cell.\n",
"    display(\n",
"        HTML(\n",
"            \"\"\"\n",
"    <style>\n",
"    pre {\n",
"        white-space: pre-wrap;\n",
"    }\n",
"    </style>\n",
"    \"\"\"\n",
"        )\n",
"    )\n",
"\n",
"# re-apply the wrap CSS ahead of each cell execution\n",
"get_ipython().events.register(\"pre_run_cell\", set_css)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "e79Gn7wCP-Dm"
},
"source": [
"## VM info"
]
},
{
"cell_type": "code",
"metadata": {
"id": "JlZgP8q1A6NW",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 370
},
"outputId": "3ddec957-89d7-43ce-b9fe-34696eb183cd"
},
"source": [
"# Make sure you have a GPU running that is 12 gb or greater\n",
"!nvidia-smi\n",
"# Runtime -> Change Runtime Type -> GPU + High-Ram if it lets you"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Wed Oct 13 00:14:48 2021 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 470.74 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 39C P0 44W / 400W | 0MiB / 40536MiB | 0% Default |\n",
"| | | Disabled |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "9rIqlIHeP_Vg",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "c01e30cf-0f7f-4a0f-c547-a1580d4d9dbd"
},
"source": [
"from psutil import virtual_memory\n",
"import os\n",
"ram_gb = round(virtual_memory().total / (1024 ** 3), 1)\n",
"print(\"Runtime has {} gigs of memory and {} processors\".format(ram_gb, os.cpu_count()))"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Runtime has 83.5 gigs of memory and 12 processors\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wIVP-NAqNsn7"
},
"source": [
"# setup"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 211
},
"id": "g5KM0loS1JhU",
"outputId": "bee77d39-544a-4e61-a37a-61927ee0fa2a"
},
"source": [
"!pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
"Requirement already satisfied: torch==1.9.1+cu111 in /usr/local/lib/python3.7/dist-packages (1.9.1+cu111)\n",
"Requirement already satisfied: torchvision==0.10.1+cu111 in /usr/local/lib/python3.7/dist-packages (0.10.1+cu111)\n",
"Requirement already satisfied: torchaudio==0.9.1 in /usr/local/lib/python3.7/dist-packages (0.9.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.9.1+cu111) (3.7.4.3)\n",
"Requirement already satisfied: pillow>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision==0.10.1+cu111) (8.3.2)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torchvision==0.10.1+cu111) (1.19.5)\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "kkPbArxONrQu",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "2a924405-c5ec-4c84-9007-970c5ad0a411"
},
"source": [
"%%capture\n",
"# install NVIDIA apex -- this apparently makes the docstore faster\n",
"!git clone https://github.com/NVIDIA/apex\n",
"# NOTE: a bare `!cd apex` runs in its own subshell and does NOT persist,\n",
"# so the original install ran against the notebook root (`./`) instead of\n",
"# the apex checkout. Chain the commands in a single shell invocation.\n",
"!cd apex && pip install -v --disable-pip-version-check --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" ./"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Hly5Ul1PR953"
},
"source": [
"<font color=\"salmon\"> *NOTE if Colab crashes while installing packages restart + run all cells (factory reset **not** required)* </font>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"id": "0KKYvfSXjpzq",
"outputId": "e47ea5e3-6782-40b1-a5c3-04a5b3d74cd2"
},
"source": [
"%%capture\n",
"# Install the latest release of Haystack in your own environment\n",
"\n",
"# !pip install farm-haystack # used to have this commented out, may need to again \n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install grpcio-tools==1.34.1\n",
"!pip install --upgrade git+https://github.com/deepset-ai/haystack.git\n",
"\n",
"# Install pygraphviz\n",
"!apt install libgraphviz-dev\n",
"!pip install pygraphviz\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "NM36kbRFA6Nc",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "957b90e8-ed22-48a3-eff0-fd0b1de76b33"
},
"source": [
"%%capture\n",
"\n",
"!pip install -U tqdm\n",
"!pip install -U dropbox\n",
"!pip install -U unidecode\n",
"!pip install -U clean-text\n",
"!pip install -U wordninja\n",
"!pip install 'ray[default]' # because of warning during imports\n",
"\n",
"import wordninja\n",
"from cleantext import clean\n",
"from tqdm.auto import tqdm\n",
"import dropbox\n",
"import joblib"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "xmRuhTQ7A6Nh",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "97922016-97f8-414e-c532-0c415d0b755a"
},
"source": [
"import gc\n",
"import pprint as pp\n",
"from datetime import datetime\n",
"\n",
"# NOTE if Colab fails while importing packages restart + run all cells (factory reset **not** required)\n",
"from haystack.preprocessor.cleaning import clean_wiki_text\n",
"from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
"from haystack.generator.transformers import Seq2SeqGenerator\n",
"from haystack.preprocessor.preprocessor import PreProcessor\n",
"from haystack.document_store.faiss import FAISSDocumentStore\n",
"from haystack.generator.transformers import RAGenerator\n",
"from haystack.retriever.dense import DensePassageRetriever\n",
"import sentencepiece\n",
"import transformers"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d3yqOW7zbKHJ"
},
"source": [
"## misc utility functions"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Pq2mysRObLp1",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "418008d1-8452-427c-e324-a8270ff52388"
},
"source": [
"from IPython.display import clear_output\n",
"\n",
"# https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html\n",
"def isnotebook():\n",
"    \"\"\"Return True when running under a Jupyter/Colab notebook shell.\n",
"\n",
"    Detection is by the class name of the active IPython shell; a\n",
"    NameError means get_ipython() does not exist, i.e. plain Python.\n",
"    \"\"\"\n",
"    try:\n",
"        shell = get_ipython().__class__.__name__\n",
"        if shell == \"ZMQInteractiveShell\":\n",
"            return True # Jupyter notebook or qtconsole\n",
"        elif shell == \"Shell\":\n",
"            return True # Colab\n",
"        elif shell == \"TerminalInteractiveShell\":\n",
"            return False # Terminal running IPython\n",
"        else:\n",
"            return False # Other type (?)\n",
"    except NameError:\n",
"        return False # Probably standard Python interpreter\n",
"\n",
"\n",
"def clear_jupyter_cell():\n",
"    \"\"\"Clear the current cell's output, but only inside a notebook.\"\"\"\n",
"    is_jupyter = isnotebook()\n",
"\n",
"    if is_jupyter:\n",
"        clear_output(wait=False)\n",
"    else:\n",
"        print(\"not in a jupyter notebook\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZKFlJq0qfp5b",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5e6f253a-004e-456e-a7da-b77bf0bcb6c6"
},
"source": [
"# basic string-cleaning helpers\n",
"import re\n",
"\n",
"\n",
"def remove_string_extras(mytext):\n",
"    # strips everything except letters, digits, spaces, and _ . , ;\n",
"    # (note: the character class below also keeps underscores)\n",
"    return re.sub(r\"[^A-Za-z0-9 _.,;]+\", \"\", mytext)\n",
"\n",
"\n",
"def corr(s):\n",
"    # collapses runs of spaces, then ensures a space follows every period\n",
"    # that lacks one, e.g. 'a.b  c' -> 'a. b c'\n",
"    return re.sub(r\"\\.(?! )\", \". \", re.sub(r\" +\", \" \", s))"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ipq9Dq1wRbrH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "6a0328e3-5565-4a57-a5a8-7dd4dd95522a"
},
"source": [
"def clean_output(ugly_text, txt_lan=\"en\"):\n",
"    \"\"\"Wrapper for cleantext.clean() with non-default options.\n",
"\n",
"    Keeps case, numbers and punctuation; strips line breaks; replaces\n",
"    URLs, emails, phone numbers and currency symbols with special tokens.\n",
"    txt_lan is passed through as clean-text's `lang` ('en' or 'de').\n",
"    \"\"\"\n",
"    # https://pypi.org/project/clean-text/\n",
"    cleaned_text = clean(\n",
"        ugly_text,\n",
"        fix_unicode=True, # fix various unicode errors\n",
"        to_ascii=True, # transliterate to closest ASCII representation\n",
"        lower=False, # lowercase text\n",
"        no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them\n",
"        no_urls=True, # replace all URLs with a special token\n",
"        no_emails=True, # replace all email addresses with a special token\n",
"        no_phone_numbers=True, # replace all phone numbers with a special token\n",
"        no_numbers=False, # replace all numbers with a special token\n",
"        no_digits=False, # replace all digits with a special token\n",
"        no_currency_symbols=True, # replace all currency symbols with a special token\n",
"        no_punct=False, # remove punctuations\n",
"        replace_with_punct=\"\", # instead of removing punctuations you may replace them\n",
"        replace_with_url=\"<URL>\",\n",
"        replace_with_email=\"<EMAIL>\",\n",
"        replace_with_phone_number=\"<PHONE>\",\n",
"        replace_with_number=\"<NUM>\",\n",
"        replace_with_digit=\"0\",\n",
"        replace_with_currency_symbol=\"<CUR>\",\n",
"        lang=txt_lan, # set to 'de' for German special handling\n",
"    )\n",
"\n",
"    return cleaned_text"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "jBg44J-cedxd",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f8e09680-5ad0-442b-8dc7-b212388f8e39"
},
"source": [
"def beautify_filename(filename, num_words=20, start_reverse=False, word_separator=\"_\"):\n",
"    \"\"\"Turn a mangled filename into a readable name (extension dropped).\n",
"\n",
"    Cleans the stem, splits concatenated text into words with wordninja,\n",
"    and joins up to num_words of them with word_separator. When\n",
"    start_reverse is True the LAST num_words words are kept instead of\n",
"    the first. Useful when reading files, transforming them, and writing\n",
"    new ones. NOTE: the extension is not returned.\n",
"    \"\"\"\n",
"    filename = str(filename)\n",
"    index_file_Ext = filename.rfind(\".\")\n",
"    # rfind() returns -1 when there is no extension; slicing with -1 would\n",
"    # silently chop the final character, so only trim when a dot was found\n",
"    current_name = filename[:index_file_Ext] if index_file_Ext != -1 else filename\n",
"    # guard against an empty stem before peeking at the last character\n",
"    if current_name and current_name[-1].isnumeric():\n",
"        current_name = current_name + \"s\"\n",
"    clean_name = clean_output(current_name)\n",
"    # splits concatenated text into a list of words based on common word freq\n",
"    file_words = wordninja.split(clean_name)\n",
"    if len(file_words) <= num_words:\n",
"        num_words = len(file_words)\n",
"\n",
"    if start_reverse:\n",
"        t_file_words = file_words[-num_words:]\n",
"    else:\n",
"        t_file_words = file_words[:num_words]\n",
"\n",
"    # str.join() never appends a trailing separator, so the old [:-1] trim\n",
"    # (which actually cut the last letter of the final word) is removed\n",
"    return word_separator.join(t_file_words)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "y_fvs3GoRnr8"
},
"source": [
"### download & extract zipped folder"
]
},
{
"cell_type": "code",
"metadata": {
"id": "k9j223dKRif-",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2165c4f9-bdf5-4053-94b0-772592b4950e"
},
"source": [
"import re, os, shutil\n",
"\n",
"\n",
"def URL_string_filter(text):\n",
"    \"\"\"Keep only filename-safe characters (alphanumerics, dot, underscore).\"\"\"\n",
"    custom_printable = (\n",
"        \"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._\"\n",
"    )\n",
"    return \"\".join(ch for ch in text if ch in custom_printable)\n",
"\n",
"\n",
"def getFilename_fromCd(cd, download_url=None):\n",
"    \"\"\"Extract a filename from a Content-Disposition header value.\n",
"\n",
"    cd:           the Content-Disposition header string (may be None)\n",
"    download_url: optional request URL used as a fallback source for the\n",
"                  filename (last path segment). If omitted, the module-level\n",
"                  `url` variable is used, preserving the original behavior.\n",
"    Returns the filename, or None if nothing could be extracted.\n",
"    \"\"\"\n",
"    if not cd:\n",
"        return None\n",
"    fname = re.findall(\"filename=(.+)\", cd)\n",
"    if len(fname) > 0:\n",
"        return fname[0]\n",
"    # BUGFIX: str.find() returns -1 (which is truthy) when '/' is absent, so\n",
"    # the old `elif cd.find(\"/\")` branch fired almost always; test explicitly\n",
"    if \"/\" in cd or download_url is not None:\n",
"        # fall back to the last path segment of the request URL; the original\n",
"        # relied on a global `url`, kept here as a last resort for old callers\n",
"        source = download_url if download_url is not None else url\n",
"        possible_fname = source.rsplit(\"/\", 1)[1]\n",
"        return URL_string_filter(possible_fname)\n",
"    return None"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "FxQ5vzywRigD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4774ed3b-3bec-4075-b21b-e0801e6846a9"
},
"source": [
"import shutil, lzma, bz2, zlib  # zipfile formats\n",
"import requests\n",
"from os.path import getsize, join, isdir\n",
"from datetime import datetime\n",
"\n",
"\n",
"def get_zip_URL(URLtoget, extract_loc=None, file_header=\"zipexport_\", verbose=False):\n",
"    \"\"\"Download an archive from a URL, unpack it, and delete the archive.\n",
"\n",
"    URLtoget:    direct-download URL of the archive\n",
"    extract_loc: directory name (under the CWD) to extract into;\n",
"                 defaults to 'zip_download'\n",
"    file_header: prefix for the locally saved archive filename\n",
"    verbose:     print size / timing info\n",
"    Returns the absolute path of the extraction directory.\n",
"    \"\"\"\n",
"    r = requests.get(URLtoget, allow_redirects=True)\n",
"    names = getFilename_fromCd(r.headers.get(\"content-disposition\"))\n",
"    try:\n",
"        fixed_fnames = names.split(\";\")  # header may carry several values\n",
"        this_filename = file_header + URL_string_filter(fixed_fnames[0])\n",
"    except (AttributeError, IndexError):  # names was None or had no parts\n",
"        this_filename = file_header + \".tar.gz\"\n",
"        print(\"has no filename, using default of {}\".format(this_filename))\n",
"\n",
"    # define paths and save the archive to disk\n",
"    if extract_loc is None:\n",
"        extract_loc = \"zip_download\"\n",
"    dl_place = join(os.getcwd(), extract_loc)\n",
"    os.makedirs(dl_place, exist_ok=True)\n",
"    save_loc = join(os.getcwd(), this_filename)\n",
"    # use a context manager so the file handle is always closed/flushed\n",
"    with open(save_loc, \"wb\") as f:\n",
"        f.write(r.content)\n",
"    if verbose:\n",
"        print(\"downloaded file size was {} MB\".format(getsize(save_loc) / 1000000))\n",
"\n",
"    # unpack the archive (format inferred from the file extension)\n",
"    shutil.unpack_archive(save_loc, extract_dir=dl_place)\n",
"    if verbose:\n",
"        print(\"extracted zip file - \", datetime.now())\n",
"    filelist = []\n",
"    for root, dirs, files in os.walk(dl_place):\n",
"        for file in files:\n",
"            # append the full path of every extracted file to the list\n",
"            filelist.append(os.path.join(root, file))\n",
"\n",
"    print(\"a total of {} files in {}\".format(len(filelist), dl_place))\n",
"\n",
"    # remove the downloaded archive, keeping only the extracted files\n",
"    try:\n",
"        os.remove(save_loc)\n",
"        del save_loc\n",
"    except OSError:  # was a bare except; only swallow filesystem errors\n",
"        print(\"unable to delete original zipfile - check if exists\", datetime.now())\n",
"\n",
"    print(\"finished extracting link - \", datetime.now())\n",
"\n",
"    return dl_place"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "u5lpLeEO-pVc"
},
"source": [
"download a file from colab"
]
},
{
"cell_type": "code",
"metadata": {
"id": "bbmuHa3c-rQh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "30b6861b-9331-4949-c7bf-cc53e7ce5200"
},
"source": [
"from google.colab import files\n",
"from os.path import basename\n",
"\n",
"\n",
"def download_file(my_path, verbose=False):\n",
"    \"\"\"Trigger a browser download of a file that lives on the Colab VM.\n",
"\n",
"    my_path: path of the file on the Colab machine\n",
"    verbose: when True, log the filename and a timestamp\n",
"    \"\"\"\n",
"    files.download(my_path)\n",
"    if not verbose:\n",
"        return\n",
"    print(\"initiated download of {} - \".format(basename(my_path)), datetime.now())"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8qJnA9sEpHhn"
},
"source": [
"## dropbox \n",
"\n",
"\n",
"- [tutorial](https://python.plainenglish.io/automate-your-pdf-upload-to-dropbox-python-script-bdacc2c721f6)\n",
"- [api docs](https://dropbox-sdk-python.readthedocs.io/en/latest/api/dropbox.html?highlight=files_upload#dropbox.dropbox_client.Dropbox.files_upload)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oz_GboGqZrwa",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"cellView": "form",
"outputId": "a9aa1c0d-9162-4290-d353-8bdb320339f4"
},
"source": [
"dropbox_subfolder = \"info-retrieval fall 2021\" # @param {type:\"string\"}\n",
"# SECURITY: never commit a real API token to a shared notebook. Paste one\n",
"# into the form field at runtime, or export DROPBOX_TOKEN in the environment.\n",
"# Any token previously published here should be revoked in the Dropbox console.\n",
"token = \"\" #@param {type:\"string\"}\n",
"if not token:\n",
"    token = os.environ.get(\"DROPBOX_TOKEN\", \"\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ErlmBoWaW5EZ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 176
},
"outputId": "d9e410f1-feab-4f39-ca04-5fe0caffbc45"
},
"source": [
"# authenticate against Dropbox using the token from the form cell above\n",
"dbx = dropbox.Dropbox(token) # if using dropbox, put your token in the above field\n",
"# smoke-test the connection; NOTE(review): this prints account details\n",
"# (name, e-mail, referral link) into the saved notebook output - clear\n",
"# this output before sharing the notebook\n",
"pp.pprint(dbx.users_get_current_account(), compact=True, indent=4)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"FullAccount(account_id='dbid:AAAYVJX_n2JYa7yh6anfANmwSJ7qkr-u8Do', account_type=AccountType('pro', None), country='CH', disabled=False, email='peterszemraj@gmail.com', email_verified=True, is_paired=False, locale='en', name=Name(abbreviated_name='PS', display_name='Peter Szemraj', familiar_name='Peter', given_name='Peter', surname='Szemraj'), profile_photo_url='https://dl-web.dropbox.com/account_photo/get/dbaphid%3AAACl3aYwbYAysBHXpGdGBXZj6IYsXocikxc?size=128x128&vers=1628569648991', referral_link='https://www.dropbox.com/referrals/AAAVOEoeLmg4Nw7hRIA9Wsn8OMf2_EdGbik?src=app9-10920512', root_info=UserRootInfo(home_namespace_id='9560958', root_namespace_id='9560958'), team=NOT_SET, team_member_id=NOT_SET)\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "TZ7ZrQfrJmQQ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "92a2893a-3327-4118-e8fe-d2bc9473ce76"
},
"source": [
"def get_size_mb(path2file, verbose=False):\n",
"    \"\"\"Return the size of a file in megabytes, rounded to 2 decimals.\n",
"\n",
"    path2file: path to an existing file\n",
"    verbose:   when True, print the (unrounded) size\n",
"    \"\"\"\n",
"    file_stats = os.stat(path2file)\n",
"    # CLEANUP: the old code wrapped the float in a one-element set and then\n",
"    # unpacked it via list(...)[0] (which also made the verbose print show\n",
"    # set braces); compute the value directly instead\n",
"    file_size_mb = file_stats.st_size / (1024 * 1024)\n",
"    if verbose:\n",
"        print(f\"File Size in MegaBytes is {file_size_mb}\")\n",
"    return round(file_size_mb, 2)  # rounded to 2 decimals"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "vrV1mVSibJIy",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "bcc2a74c-de1b-4ce3-f776-ad5b5b0e2c43"
},
"source": [
"from os.path import join, basename, dirname\n",
"import time, random\n",
"\n",
"\n",
"def put_in_dropbox(\n",
"    vm_path, subfolder=dropbox_subfolder, no_printout=True, ncalls=0, max_calls=3\n",
"):\n",
"    \"\"\"Upload a local file to the Dropbox app folder at /<subfolder>/<name>.\n",
"\n",
"    Retries up to max_calls times with a small random delay between tries.\n",
"    Returns a failure-message string after exhausting the retries; returns\n",
"    None on success (unchanged from the original contract).\n",
"    \"\"\"\n",
"    if ncalls > max_calls:\n",
"        return \"failed saving to DropBox - {} tries\".format(ncalls)\n",
"    base_filename = basename(vm_path)\n",
"    db_path = \"/{}/{}\".format(subfolder, base_filename)\n",
"    try:\n",
"        with open(vm_path, \"rb\") as f:\n",
"            dbx.files_upload(f.read(), path=db_path, autorename=True, mute=no_printout)\n",
"    except Exception:  # was a bare except, which also swallowed KeyboardInterrupt\n",
"        print(\n",
"            \"WARNING - unable to post in dropbox, retry no. {} - \".format(ncalls + 1),\n",
"            datetime.now(),\n",
"        )\n",
"        time.sleep(random.randint(1, 3))  # small delay before trying again\n",
"        # BUGFIX: the retry previously dropped the caller's subfolder and\n",
"        # no_printout arguments and discarded the recursive result; pass\n",
"        # everything through and return it so the failure message propagates\n",
"        return put_in_dropbox(\n",
"            vm_path,\n",
"            subfolder=subfolder,\n",
"            no_printout=no_printout,\n",
"            ncalls=ncalls + 1,\n",
"            max_calls=max_calls,\n",
"        )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hcu7JY0fNy81"
},
"source": [
"# process documents"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "q3dSo7ZtA6Nl"
},
"source": [
"### Document Store\n",
"\n",
"> FAISS is a library for efficient similarity search on a cluster of dense vectors.\n",
"The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under-the-hood\n",
"to store the document text and other meta data. The vector embeddings of the text are\n",
"indexed on a FAISS Index that later is queried for searching answers.\n",
"The default flavour of FAISSDocumentStore is \"Flat\" but can also be set to \"HNSW\" for\n",
"faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor.\n",
"For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n",
"\n",
"- [link](https://haystack.deepset.ai/docs/latest/apidatabasemd#Module-faiss) to haystack API on doc store\n",
"\n",
"\n",
"**elasticsearch**\n",
"\n",
"```\n",
"Username\n",
"elastic\n",
"Password\n",
"<REDACTED - do not store credentials in a shared notebook; rotate this password and load it from an environment variable>\n",
"```\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
},
"id": "ggPXF9rWhW2h"
},
"source": [
"### Cleaning & indexing documents\n",
"\n",
"download, convert and index to FAISS DocumentStore"
]
},
{
"cell_type": "code",
"metadata": {
"id": "WMHL9-v7TYHn",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "52a031d0-d33b-4824-9c38-103a31234727"
},
"source": [
"# NOTE(review): the postgres URL below embeds real credentials (user:password@host).\n",
"# Rotate them and load the connection string from an environment variable\n",
"# instead of committing it to a shared notebook.\n",
"URL_to_archive = \"https://www.dropbox.com/sh/xioggpj5h2lqrt9/AAD865bEus7OaI-Q87I8muS1a?dl=1\" # @param {type:\"string\"}\n",
"postgres_url = \"postgres://jamdqivbdmulkt:849bec75e91a29ef2940298bd8256cf433621a66161d1e86c308f642610757fe@ec2-54-220-35-19.eu-west-1.compute.amazonaws.com:5432/d9qsafiqdeq81c\" # @param {type:\"string\"}\n",
"use_elasticsearch = False # @param {type:\"boolean\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0sP88vM04-7x",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "a193402a-d675-4849-97d6-39f56af8e46c"
},
"source": [
"%%time\n",
"# Optionally start a local Elasticsearch 7.9.2 instance. Colab has no Docker,\n",
"# so the tarball is downloaded and the server is launched from source.\n",
"if use_elasticsearch:\n",
"    from haystack.utils import launch_es\n",
"\n",
"    launch_es()\n",
"\n",
"    # In Colab / No Docker environments: Start Elasticsearch from source\n",
"    ! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"    ! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"    ! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"    import os\n",
"    from subprocess import Popen, PIPE, STDOUT\n",
"\n",
"    # launch the server in the background, merging stderr into stdout\n",
"    es_server = Popen(\n",
"        [\"elasticsearch-7.9.2/bin/elasticsearch\"],\n",
"        stdout=PIPE,\n",
"        stderr=STDOUT,\n",
"        preexec_fn=lambda: os.setuid(1),  # run as the daemon user, not root\n",
"    )\n",
"    # wait until ES has started (fixed delay; no readiness probe here)\n",
"    ! sleep 30"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 2 µs, sys: 1 µs, total: 3 µs\n",
"Wall time: 5.72 µs\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "6L_gvC6r5WkM",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "df3b958e-44f4-450f-8206-340cf275fc89"
},
"source": [
"if use_elasticsearch:\n",
"    from haystack.document_store.elasticsearch import ElasticsearchDocumentStore\n",
"\n",
"    # use local instance until decide on online provider\n",
"    # duplicate_documents='skip' avoids re-indexing identical passages;\n",
"    # NOTE(review): 'dot_product' must match the retriever's similarity\n",
"    # metric - confirm against the retriever cell\n",
"    document_store = ElasticsearchDocumentStore(\n",
"        host=\"localhost\",\n",
"        username=\"\",\n",
"        password=\"\",\n",
"        index=\"document\",\n",
"        return_embedding=True,\n",
"        duplicate_documents=\"skip\",\n",
"        similarity=\"dot_product\",\n",
"    )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1cYgDJmrA6Nv",
"pycharm": {
"name": "#%%\n"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "4523f923-f335-4688-e4f1-e38c4c280f3d"
},
"source": [
"from haystack.document_store.faiss import FAISSDocumentStore\n",
"\n",
"# Heroku-style 'postgres://' URLs are rewritten to 'postgresql://', the\n",
"# dialect name SQLAlchemy expects (only the first occurrence is replaced)\n",
"if postgres_url.startswith(\"postgres://\"):\n",
"    postgres_url = postgres_url.replace(\"postgres://\", \"postgresql://\", 1)\n",
"\n",
"if not use_elasticsearch:\n",
"\n",
"    # 'Flat' = exact (brute-force) FAISS index; see the markdown note above\n",
"    # for the HNSW trade-off. sql_url is commented out, so the store uses\n",
"    # its default SQL backend rather than the postgres instance.\n",
"    document_store = FAISSDocumentStore(\n",
"        faiss_index_factory_str=\"Flat\",\n",
"        return_embedding=True,\n",
"        similarity=\"dot_product\",\n",
"        progress_bar=True,\n",
"        duplicate_documents=\"skip\",\n",
"        # sql_url=postgres_url\n",
"    )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YK3G4UV3RhpG"
},
"source": [
"The hyperparameter `words_per_doc` *drastically* changes performance, and Haystack recommends leaving it at 100. Changing it only makes sense if the relevant results of the intended queries are very long **and** the number of documents is relatively small."
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "Oc86ThopfGJD",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "b40171b2-ac20-456b-f28b-88ba70181e5e"
},
"source": [
"words_per_doc = 100 # @param {type:\"integer\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"id": "1I0c27eXhW2i",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "7e358b7f-9b4f-4dbf-c258-c3d256ded5a3"
},
"source": [
"%%capture\n",
"# NOTE(review): %%capture swallows ALL output of this cell, including the\n",
"# duplicate-document warnings and the final 'documents written' print below\n",
"# custom dataset loaded above\n",
"path_to_dataset = get_zip_URL(\n",
"    URL_to_archive, extract_loc=\"text_corpus\", file_header=\"source_text\", verbose=True\n",
")\n",
"\n",
"# Convert files to dicts\n",
"dicts = convert_files_to_dicts(\n",
"    dir_path=path_to_dataset,\n",
"    clean_func=clean_wiki_text,\n",
"    split_paragraphs=True,\n",
")\n",
"# split each document into ~words_per_doc-word passages, keeping sentences intact\n",
"preprocessor = PreProcessor(\n",
"    clean_empty_lines=True,\n",
"    clean_whitespace=True,\n",
"    clean_header_footer=True,\n",
"    split_by=\"word\",\n",
"    split_length=words_per_doc,\n",
"    split_respect_sentence_boundary=True,\n",
")\n",
"nested_docs = [preprocessor.process(d) for d in dicts]\n",
"# flatten the list-of-lists of passages into one list\n",
"docs = [d for x in nested_docs for d in x]\n",
"\n",
"# Now, let's write the dicts containing documents to our DB.\n",
"document_store.delete_documents()\n",
"\n",
"document_store.write_documents(docs, duplicate_documents=\"skip\", batch_size=30000)\n",
"clear_jupyter_cell()\n",
"print(\"documents written - \", datetime.now())"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"One or more sentence found with word count higher than the split length.\n",
"Duplicate Documents: Document with id '3fc75acbf0d22b41ff99f2f809174377' already exists in index 'document'\n",
"Duplicate Documents: Document with id '3fc75acbf0d22b41ff99f2f809174377' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b76ff7cdc3c33bcd5202a40c92f9db8' already exists in index 'document'\n",
"Duplicate Documents: Document with id '84c6abdd728ec341ee4ed52075691c98' already exists in index 'document'\n",
"Duplicate Documents: Document with id '14792a0c3e8661b4eb5d375c02be867f' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c59e8fd4f89f4d294b63ae5d3f5a14ee' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'cf796768b357c5401091640b2ee3749' already exists in index 'document'\n",
"Duplicate Documents: Document with id '7e2c8edd4d1197cdabf28b1845e9463c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '7e2c8edd4d1197cdabf28b1845e9463c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '388110455115d5900deb8ac135fe120d' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'bca9d0b79a7ec1f75c4dac5d48c9dc5c' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'bca9d0b79a7ec1f75c4dac5d48c9dc5c' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'bc11fba85359700f3bf04b0a18d296e1' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '3910ff416058385da6e73929914939ac' already exists in index 'document'\n",
"Duplicate Documents: Document with id '8ed491800e03644aeacf2c214c2d3645' already exists in index 'document'\n",
"Duplicate Documents: Document with id '8ed491800e03644aeacf2c214c2d3645' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a42431ba7e109d9af26257a87f0fdab2' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c6e8278fe813ad8311a022b9fc4e366a' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c6e8278fe813ad8311a022b9fc4e366a' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c6e8278fe813ad8311a022b9fc4e366a' already exists in index 'document'\n",
"Duplicate Documents: Document with id '39f449504c28989b4a9005a2189b15e7' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e3f237825aa1e67b7d82d3d8712e9ed' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b6761e3140bdbb3307f8ad8c3595b89' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b6761e3140bdbb3307f8ad8c3595b89' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b6761e3140bdbb3307f8ad8c3595b89' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e690647cf9dc54968ab41a146d248dac' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b6761e3140bdbb3307f8ad8c3595b89' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2b6761e3140bdbb3307f8ad8c3595b89' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6c3fd34e3517e4a2c6649a1ae676a772' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9770aa34ee13ba6bff77b30199696947' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'de1e99c040567f7c9cc8b465b2d3e7c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9770aa34ee13ba6bff77b30199696947' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'de158208176ea05b06ad5aa070347923' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9770aa34ee13ba6bff77b30199696947' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '8abe53acd2a56278f5df744d9a5ce96b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ee567b1b888eea47026a0f1107c82852' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ee567b1b888eea47026a0f1107c82852' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6f6e72c87e186fc027ea73753ca2b270' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6f6e72c87e186fc027ea73753ca2b270' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6f6e72c87e186fc027ea73753ca2b270' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'fd2ffa74347778ab35726e29aec775f6' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6465c358ec88dfa00f21b318abb701f7' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'd92240ca8a94f307c7d7b8547dbf000f' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'd92240ca8a94f307c7d7b8547dbf000f' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '290a1f88996b2c0df8b2c9bb575c3348' already exists in index 'document'\n",
"Duplicate Documents: Document with id '314ca695b6eb2f615a135f667daf934d' already exists in index 'document'\n",
"Duplicate Documents: Document with id '167095313d23643e271da6859578e06b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'dc8974f6a50e04606535f450e89892bc' already exists in index 'document'\n",
"Duplicate Documents: Document with id '7e998b897cbd73bac0dc98c6d0af1b50' already exists in index 'document'\n",
"Duplicate Documents: Document with id '4504881aa9e534c91490b7a1996a0c0c' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'becbcadffc6a523fdfc1d0a88bb275e2' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '61fb3113463a58abbbe8049a064da8f4' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15e0895249d890ad76973a7445f0ab8b' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ffe76364f1f1f187b7bf13b8f832da7b' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9eca475e4f516674adeda96dda1d0805' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id '660cc16be67be14d506c5c6e00d34122' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2d28613c4c11228d07e231232dd27bab' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'cbcd87a065ead9436e39d7839210b3e3' already exists in index 'document'\n",
"Duplicate Documents: Document with id '2afda833290b0b94adf51af4652f1a31' already exists in index 'document'\n",
"Duplicate Documents: Document with id '53ba583815e476795da744e968328b61' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a9a5ff6e7577845caa03220e819a6ff' already exists in index 'document'\n",
"Duplicate Documents: Document with id '253db3a5ffe8075a732da8610b763d3c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '149771a222b08e4ba64b80e980493074' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'bf6c09258a5d5e234333d65de23b940e' already exists in index 'document'\n",
"Duplicate Documents: Document with id '79700721609f2f898cea60c493766f50' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c6e8fd36e9d5774b03c0042d8be66ead' already exists in index 'document'\n",
"Duplicate Documents: Document with id '308a4a1cba5403956467f866944e648e' already exists in index 'document'\n",
"Duplicate Documents: Document with id '93b38e8744ce0c56a1a55ab7726710b3' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c7fd923a49a23bb878063d7fe31da36' already exists in index 'document'\n",
"Duplicate Documents: Document with id '41748b79fca9b806fb6e0028ade08dc9' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e0efdbb5a69d07e1efac192b3121c7d0' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'd264a11bca4b1ce319cfce554267460' already exists in index 'document'\n",
"Duplicate Documents: Document with id '5c593954616ad3c23d37547c266b8024' already exists in index 'document'\n",
"Duplicate Documents: Document with id '883780258bcea61a2f7aa64f3f6e1a87' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'cc5dc0da39d156c4f28be7f7b1bd39d8' already exists in index 'document'\n",
"Duplicate Documents: Document with id '921895de3b46073d1da9af36254c1e8' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'c19c6e1af500cd88020ad0c2ca3547ec' already exists in index 'document'\n",
"Duplicate Documents: Document with id '18ec18ff5f6c608379fb93bb0d14601e' already exists in index 'document'\n",
"Duplicate Documents: Document with id '5f724b10fbb4cfab5481b3a80e999520' already exists in index 'document'\n",
"Duplicate Documents: Document with id '8145b539b74c5ecb845999ee6773c2b7' already exists in index 'document'\n",
"Duplicate Documents: Document with id '1a9a2f9323dae88b6a7d764dba2b80ea' already exists in index 'document'\n",
"Duplicate Documents: Document with id '470c339c287828923809d7cb1dbf5c6f' already exists in index 'document'\n",
"Duplicate Documents: Document with id '3dafc6980be5367a1e64cfbdb3a929ae' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ec12dd3770fc0d9a905996c517dfda0a' already exists in index 'document'\n",
"Duplicate Documents: Document with id '46a466799df4e071620f5c5d79762a27' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ae0007f838a2a8e2341b159ba9843a5' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6d1e4cde1407991f5cb3f8e4109c8ebc' already exists in index 'document'\n",
"Duplicate Documents: Document with id '494dde33452913fae163b02e60f5502' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b01b255723ecc080dec9f8dcc0c0c883' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9dbd5116d6164b0478a570591891d4d8' already exists in index 'document'\n",
"Duplicate Documents: Document with id '9fc855a862eff28f95456ee0d8f213d9' already exists in index 'document'\n",
"Duplicate Documents: Document with id '5b8f074b51660ad8941243f67ae47783' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ce020d19512cb7712b3c9c529e67a7f8' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e8b2c476799e3fd3dcde1c8cf0fd693c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6c3e3f0b03e68e8412cccd8a6361638f' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'f1c2d481ef5c2dbe5859d765a4cc187e' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'd0a6048ec0f7f3c35c48520c91ac42ee' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'cc2a9e7979ada4faeab5ca91124e2ffb' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'd377f33b8b2324cb6b368291ec682a65' already exists in index 'document'\n",
"Duplicate Documents: Document with id '21f18bedc6831de9ffc90d68a39b6f8f' already exists in index 'document'\n",
"Duplicate Documents: Document with id '12e12c232bff74d9edf5b31f73abd98d' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ba7ac3101422a213f13db1a237427140' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ff7b704fe1f1410605748db51c89ce18' already exists in index 'document'\n",
"Duplicate Documents: Document with id '74af7cad2cacc4baed66f6642bf44d4c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '154ac6a6fb6247737bcb2d14c4bb94c7' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6f12519bccab4a732f578dc4d8546652' already exists in index 'document'\n",
"Duplicate Documents: Document with id '244e8f71e45e9d690bd17fe06df3c891' already exists in index 'document'\n",
"Duplicate Documents: Document with id '3800dd28345881d2fb46385cf43e2272' already exists in index 'document'\n",
"Duplicate Documents: Document with id '260d29daeabf8db035263e3840fe306c' already exists in index 'document'\n",
"Duplicate Documents: Document with id '475e4eeab018d3b66cdad1dd8414717b' already exists in index 'document'\n",
"Duplicate Documents: Document with id '6d40ea3f4f77c5c7fe606f7febb6001e' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'f6d49106fac479289e81b6a67b8eddb7' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e79a470dcae538c2e34364d1c01358e4' already exists in index 'document'\n",
"Duplicate Documents: Document with id '15702e2537fe19071e6ded645c84ad54' already exists in index 'document'\n",
"Duplicate Documents: Document with id '49befd77b2424d5780d88f577a8bb07b' already exists in index 'document'\n",
"Duplicate Documents: Document with id '47afc114e371031d2f374bc139f97573' already exists in index 'document'\n",
"Duplicate Documents: Document with id '163dd32ed2fbeeacad42e87b19893c46' already exists in index 'document'\n",
"Duplicate Documents: Document with id '3e579b0c7d1777247808e8c23fe978b7' already exists in index 'document'\n",
"Duplicate Documents: Document with id '87158a9d2ff055dd645cd1bc2eac0108' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a5b4ae3d2870c95fe606b4d6309f1a48' already exists in index 'document'\n",
"Duplicate Documents: Document with id '8e19d85eff0a2c9aa646d3884288917d' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id '61fb3113463a58abbbe8049a064da8f4' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id '61fb3113463a58abbbe8049a064da8f4' already exists in index 'document'\n",
"Duplicate Documents: Document with id '61fb3113463a58abbbe8049a064da8f4' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'e2cb9558f6892b9dfc72317d75c54700' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a6a8fd992e793874a885992a200d87b1' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a6a8fd992e793874a885992a200d87b1' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a6a8fd992e793874a885992a200d87b1' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'ffcd34dc3ad4224495470d7b58a21af2' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'a6a8fd992e793874a885992a200d87b1' already exists in index 'document'\n",
"Duplicate Documents: Document with id 'b1b1623e1c61c549caa42602e50fe352' already exists in index 'document'\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VouRJydoPysN"
},
"source": [
"# Info Retrieval Architecture"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qeFya9Wl-8gj"
},
"source": [
"## Retriever\n",
"\n",
"- tutorial originally uses a `RetribertRetriever` and invokes `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`:\n",
"\n",
"```\n",
"retriever = EmbeddingRetriever(document_store=document_store,\n",
" embedding_model=\"yjernite/retribert-base-uncased\",\n",
" model_format=\"retribert\")\n",
"```\n",
"\n",
"- based on some results from the other tutorials, `DensePassageRetriever` is better but slower. I use it here."
]
},
{
"cell_type": "code",
"metadata": {
"id": "kFwiPP60A6N7",
"pycharm": {
"is_executing": true
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 88
},
"outputId": "8b32d25a-78de-43a1-ce32-77f761d9b4f5"
},
"source": [
"%%time\n",
"# Import the retriever class that is actually instantiated below.\n",
"# (The tutorial's unused EmbeddingRetriever import previously lived here,\n",
"# leaving DensePassageRetriever undefined on a fresh kernel run.)\n",
"from haystack.retriever.dense import DensePassageRetriever\n",
"\n",
"# dense retriever: slower than the embedding retriever, but more accurate\n",
"retriever = DensePassageRetriever(\n",
" document_store=document_store,\n",
" query_embedding_model=\"facebook/dpr-question_encoder-single-nq-base\",\n",
" passage_embedding_model=\"facebook/dpr-ctx_encoder-single-nq-base\",\n",
" use_gpu=True,\n",
" embed_title=True,\n",
" max_seq_len_passage=256,\n",
" max_seq_len_query=128,\n",
" top_k=50, # tested up to 50\n",
" use_fast_tokenizers=False, \n",
" similarity_function=\"dot_product\",\n",
" progress_bar=True,\n",
" batch_size=16, # works partially on 32\n",
")\n",
"# only embed docs that do not have an embedding yet (avoids recomputing)\n",
"document_store.update_embeddings(retriever, update_existing_embeddings=False)\n",
"\n",
"clear_jupyter_cell()\n",
"print(\"Finished! - \", datetime.now(), \"\\n\\n\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Finished! - 2021-10-13 00:17:57.778464 \n",
"\n",
"\n",
"CPU times: user 1min 43s, sys: 3.53 s, total: 1min 47s\n",
"Wall time: 1min 48s\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cEmkX-mHPesa"
},
"source": [
"#### save embedded docstore"
]
},
{
"cell_type": "code",
"metadata": {
"id": "TKwBziYoPg98",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "868d7c81-60d3-4cd9-bad4-f8d086f272d7"
},
"source": [
"%%time\n",
"impossible = False # this section is useless, but not ready to drop yet\n",
"if not use_elasticsearch and impossible:\n",
"\n",
" from google.colab import files\n",
"\n",
" ds_date_time = datetime.now().strftime(\"_%m.%d.%Y_\")\n",
" ds_outname = \"DocStore - class {} - words {} - date {}.pkl\".format(\n",
" course_name, words_per_doc, ds_date_time\n",
" )\n",
" document_store.save(ds_outname)\n",
" print(\n",
" \"Finished saving files, total {} mb - \".format(get_size_mb(ds_outname)),\n",
" datetime.now(),\n",
" )\n",
" # try to store in cloud, if not, download\n",
" dsfs = round(get_size_mb(ds_outname), 2)\n",
" if dsfs < 150:\n",
" put_in_dropbox(ds_outname)\n",
" else:\n",
" print(\"can't put in dropbox - filesize is {} mb - downloading\".format(dsfs))\n",
" files.download(ds_outname)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 3 µs, sys: 0 ns, total: 3 µs\n",
"Wall time: 6.44 µs\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sMlVEnJ2NkZZ"
},
"source": [
"#### validate retriever\n",
"\n",
"- Before blindly using the `Retriever` part of the pipeline, empirically test it to make sure a simple search indeed finds the relevant documents.\n",
"- the documents printed out (and text) should be relevant to the query. If not, the questions part won't work"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "gpKCew7WU8kr",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "e56ad8e1-4418-4429-eefd-6766609fa440"
},
"source": [
"test_query = \"weisfehler-lehman kernel\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "qpu-t9rndgpe",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "77df482e-dca9-4818-e3bf-65fe157e36cd"
},
"source": [
"from haystack.utils import print_answers, print_documents\n",
"from haystack.pipeline import DocumentSearchPipeline\n",
"\n",
"p_retrieval = DocumentSearchPipeline(retriever)\n",
"res = p_retrieval.run(query=test_query, params={\"retriever\": {\"top_k\": 5}})\n",
"print_documents(res, max_text_len=256)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Query: weisfehler-lehman kernel\n",
"\n",
"{ 'name': 'conv_dm_and_ac_8_chapter_5_kernel_methods_TB.txt',\n",
" 'text': 'KERNEL METHODS 165 Gaussian Kernel The Gaussian kernel, also '\n",
" 'called the Gaussian radial basis function (RBF) kernel; is '\n",
" 'defined as Ilx -yll 202 K(x,y) = exp (5.10) where 0 0 is the '\n",
" 'spread parameter that plays the same role as the standard '\n",
" 'deviation in a no...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_6_the_vc_dimension_t.txt',\n",
" 'text': \"This claim is often referred to as Sauer' s lemma; but it has \"\n",
" 'also been stated and proved independently by Shelah and by '\n",
" 'Perles The formal statement is given in Section 6.5.1 later.'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_24_chapter_22_classification_assessment_TB.txt',\n",
" 'text': 'The concept of boosting was proposed in (Breiman, 1996) , and '\n",
" 'that of adaptive boosting in (Freund and Schapire, 1997). Random '\n",
" 'forests is a tree-based ensemble approach that can be very '\n",
" 'effective; see (Breiman; 2001) for details. For a compre hensive '\n",
" 'overv...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_13_regular_iz_ation_and_stability_t.txt',\n",
" 'text': 'In addition, practi- cal methods have been developed to introduce '\n",
" 'stability into learning algorithms, in particular the Bagging '\n",
" 'technique introduced by (Breiman 1996). Over the last decade, '\n",
" 'stability was studied as a generic condition for learnabil- ity: '\n",
" 'S...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_13_regular_iz_ation_and_stability_t.txt',\n",
" 'text': '13.7 Exercises 181 In the context of modern learning theory, the '\n",
" 'use of stability can be traced back at least to the work of '\n",
" 'Rogers & Wagner (1978) , which noted that the sensitiv- ity of a '\n",
" 'learning algorithm with regard to small changes in the sample '\n",
" 'cont...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_8_chapter_5_kernel_methods_TB.txt',\n",
" 'text': 'KERNEL METHODS 180 Neumann kernel is given as (0.75 0.02 0.11 '\n",
" '0.11 0.02 0.02 0.74 0.10 0.03 0.11 K = U(I _ 0.24)-1UT = 0.11 '\n",
" '0.10 0.66 0.10 0.03 0.11 0.03 0.10 0.66 0.10 0.02 0.11 0.03 0.10 '\n",
" '0.74 5.5 Further Reading Kernel methods have been extensively '\n",
" 'studi...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_22_chapter_19_decision_tree_class_i_fier_lda_TB.txt',\n",
" 'text': '20.3 Further Reading Linear discriminant analysis was introduced '\n",
" 'in (Fisher, 1936). Its extension to kernel discriminant analysis '\n",
" 'was proposed in (Mika et al,, 1999). The 2-class LDA approach can '\n",
" 'be generalized to k 2 classes by finding the optimal (k 1)-d...'}\n",
"\n",
"{ 'name': 'HW20-solution_kernel.txt',\n",
" 'text': \"is a ker- nel: k(x,x') is a combination of a polynomial kernel ( \"\n",
" \"(x,x') +0)2, a constant kernel, a linear kernel xTx' , and the \"\n",
" 'RBF kernel: The combination follows the closure properties that a '\n",
" 'sum of two kernels is a kernel and that multiplying a kernel w...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_13_regular_iz_ation_and_stability_t.txt',\n",
" 'text': 'In the next chapter we will present Stochastic Gradient Descent, '\n",
" 'which gives us a very practical alternative way to learn '\n",
" 'convex-Lipschitz-bounded and convex- smooth-bounded problems and '\n",
" 'can also be used for efficiently implementing the RLM rule_ 13.6 '\n",
" 'Bibl...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_10_boosting_t.txt',\n",
" 'text': 'AdaBoost also shows an equivalence between the existence of a '\n",
" 'weak learner and separability of the data using a linear '\n",
" 'classifier over the predictions of base hypotheses: This result '\n",
" \"is closely related to von Neumann's minimax theorem (von Neumann \"\n",
" '1928) ,...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_6_the_vc_dimension_t.txt',\n",
" 'text': 'However; in general, there is no equivalence between learnability '\n",
" 'and uniform convergence See (Shalev-Shwartz , Shamir , Srebro & '\n",
" 'Sridharan 2010, Daniely; Sabato, Ben-David & Shalev-Shwartz '\n",
" \"2011). Sauer's lemma has been proved by Sauer in response to a \"\n",
" 'pro...'}\n",
"\n",
"{ 'name': 'OCR_dmc_part_2_classification_algorithm.txt',\n",
" 'text': '(26) Data Mining 1, Basel Fall Semester 2020 154 159 D-BSSEL '\n",
" 'Karsten Bqngwardt\\n'\n",
" \"Elzurich Kernels Some useful kernels the constant 'all-ones' \"\n",
" 'kernel: klx,x) = 1 the delta (Dirac) kernel: 1 x=x 0 otherwise '\n",
" \"k(x,x') = D-BSSE Karsten Borgwardt Data Mining 1, Ba...\"}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_20_references_TB.txt',\n",
" 'text': 'Probabilistic Non-Linear Principal Component Analysis with '\n",
" 'Gaussian Process Latent Variable Models. Journal of Machine '\n",
" 'Learning Research, 6(Nov), 1783-1816. Leemis, Lawrence M., and '\n",
" 'McQueston, Jacquelyn T. 2008_ Univariate Distribution '\n",
" 'Relationships. Ameri...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_20_references_TB.txt',\n",
" 'text': 'Theory of Reproducing Kernels and its Applications Longman '\n",
" 'Scientific and Technical: Sarkka, Simo. 2013. Bayesian Filtering '\n",
" 'and Smoothing. Cambridge University Press. Scholkopf; Bernhard, '\n",
" 'and Smola, Alexander J. 2002. Learning with Kernels Support '\n",
" 'Vector ...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_36_index_t.txt',\n",
" 'text': \"Sauer, N. (1972) , 'On the density of families of sets' _ Journal \"\n",
" 'of Combinatorial Theory Series A 13, 145-147. Schapire; R: '\n",
" \"(1990), 'The strength of weak learnability' , Machine Learning \"\n",
" '5(2) , 197 227 . Schapire; R E. & Freund_ Y. (2012) , Boosting: '\n",
" 'Foun...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_10_boosting_t.txt',\n",
" 'text': 'Note that for every distribution D over R and every partitioning '\n",
" 'of the line into three such regions, one of these regions must '\n",
" 'have D-weight of at most 1/3. Let h € H be a zero error '\n",
" 'hypothesis. A decision stump that disagrees with h only on such a '\n",
" 'region...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'It has been pointed out [133] that the advantages of the primal '\n",
" 'approach in SVMs seem to have been largely overlooked in the '\n",
" 'literature: It is sometimes mistakenly understood that the kernel '\n",
" 'trick can only be applied to the dual; the trick can be applied '\n",
" 't...'}\n",
"\n",
"{ 'name': 'OCR_dmc_part_4_features_electio.txt',\n",
" 'text': 'D-BSSE Karsten Borgwardt Data Mining 1, Basel Fall Semester 2020 '\n",
" '246 259\\n'\n",
" 'Elzurich Hilbert-Schmidt Independence Criterion Definition '\n",
" '(Gretton et al,, 2005) The Hilbert-Schmidt Independence Criterion '\n",
" 'HSIC) measures the dependence of two random variables X an...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_25_index_TB.txt',\n",
" 'text': 'CHAPTER 22 CLASSIFICATION ASSESSMENT 647 data-specific kernel '\n",
" 'map, 158 diffusion kernel, 175 exponential, 176 power kernel, 176 '\n",
" 'von Neumann, 177 empirical kernel map, 157 Gaussian kernel, 165 '\n",
" 'graph kernel, 175 Hilbert space, 157 kernel matrix; 152 kernel '\n",
" 'o...'}\n",
"\n",
"{ 'name': 'OCR_dmc_part_2_classification_algorithm.txt',\n",
" 'text': 'D-BSSE Karsten Borgwardt Data Mining 1, Basel Fall Semester 2020 '\n",
" '107 159\\n'\n",
" 'Elzurich Logistic Regression D-BSSE Karsten Borgwardt Data Mining '\n",
" '1, Basel Fall Semester 2020 108 159\\n'\n",
" 'Elzurich Logistic Regression Concept (David Cox; 1958) Logistic '\n",
" 'Regression is a c...'}\n",
"\n",
"{ 'name': 'hw21_i_ml_21_solution_3_.txt',\n",
" 'text': 'Exercises Introduction to Machine Learning FS 2021 Institute for '\n",
" 'Machine Learning Dept: of Computer Science, ETH Zurich Prof. Dr. '\n",
" 'Andreas Krause, Prof: Dr. Fanny Yang Web: https : / /las inf '\n",
" 'ethz. ch/teaching/introml-s21 For questions, please refer to '\n",
" 'Piaz...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_8_chapter_5_kernel_methods_TB.txt',\n",
" 'text': 'Please do not distribute. Feedback is Welcome_ Note that this '\n",
" 'book shall be available for purchase from Cambridge University '\n",
" 'Press and other standard distribution channels, that no '\n",
" 'unauthorized distribution shall be allowed, and that the reader '\n",
" 'may make on...'}\n",
"\n",
"{ 'name': 'conv_intro_ml_14_clusterin.txt',\n",
" 'text': '-2 -4 Ezurich Introduction to Machine Learning; Spring 2021, '\n",
" 'Prof. Andreas Krause 29\\n'\n",
" 'Heuristic for determining k 80O0 7000 6000 6 50O0 8 4000 3 3000 '\n",
" \"2000 -4 10OO 10 k 1 99 Diminishing returns' in the loss function \"\n",
" 'Pick k so that increasing k leads to negl...'}\n",
"\n",
"{ 'name': 'ESL__21_reference.txt',\n",
" 'text': 'Evidence contrary to the statistical view of boosting (with '\n",
" 'discussion), Journal of Machine Learning Research 9: 131-156. '\n",
" 'Meinshausen, N. (2007)_ Relaxed lasso, Computational Statistics '\n",
" 'and Data Analysis 52(1): 374-393. 718 References Meinshausen, N. '\n",
" 'and B...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_22_chapter_19_decision_tree_class_i_fier_lda_TB.txt',\n",
" 'text': 'Feedback is Welcome_ Note that this book shall be available for '\n",
" 'purchase from Cambridge University Press and other standard '\n",
" 'distribution channels, that no unauthorized distribution shall be '\n",
" 'allowed, and that the reader may make one copy only for personal '\n",
" 'o...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1611_data_classification_advanced_concepts_TB.txt',\n",
" 'text': 'By iteratively using this approach and creating a weighted '\n",
" 'combination of the various classifiers, it is possible to create '\n",
" 'a classifier with lower overall bias. For example; in Fig: 11.6a, '\n",
" 'each individual SVM is not globally optimized and is accurate '\n",
" 'only...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_21_online_learning_t.txt',\n",
" 'text': 'An illustration of a shattered tree of depth 2 is given in Figure '\n",
" \"21.1. DEFINITION 21.5 (Littlestone's Dimension (Ldim)) Ldim(H) is \"\n",
" 'the maximal integer T such that there exists a shattered tree of '\n",
" 'depth T, which is shattered by H. The definition of Ldim an...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_116_cluster_analysis_TB.txt',\n",
" 'text': 'For a d-dimensional data set, the Gaussian kernel is defined as '\n",
" 'follows: W-Xil? e 2.h2 K(X _ Xi) = hV2t (6.19) The term IIX Xill '\n",
" 'represents the Euclidean distance between these d-dimensional '\n",
" 'data points Intuitively, the effect of kernel-density estimation '\n",
" '...'}\n",
"\n",
"{ 'name': 'PRML_20_appendix_b_probability_distribution.txt',\n",
" 'text': 'U(x a,b) (B.73) b _ a (6 + a) 2 (b _ a)2 12 In(b = a) Elx] B.74) '\n",
" 'var[x] Hlx] (B.75) (B.76) If x has distribution U(w/0, 1) , then '\n",
" 'a + (b = a)x will have distribution U(w/a,b)\\n'\n",
" 'B PROBABILITY DISTRIBUTIONS 693 Von Mises The von Mises '\n",
" 'distribution, also known ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_26_bibliography_TB.txt',\n",
" 'text': '[449] B. Scholkopf; and A J. Smola Learning with kernels: '\n",
" 'support vector machines, reg- ularization; optimization, and '\n",
" 'beyond_ Cambridge University Press, 2001_ [450] B. Scholkopf; A. '\n",
" 'Smola; and K-R: Muller . Nonlinear component analysis as a kernel '\n",
" 'eigen...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_8_chapter_5_kernel_methods_TB.txt',\n",
" 'text': 'Von Neumann Diffusion Kernel related kernel based on powers of S '\n",
" 'is the uon Neumann diffusion kernel, defined as K = 8lsl (5.17) '\n",
" '1=0 where 8 > 0_ Expanding the above, we have K =I+ BS + 8252 + '\n",
" '8353 + = I+ BS(I + BS + 82s2 + .- = I+ BSK Rearranging the '\n",
" 'term...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_10_boosting_t.txt',\n",
" 'text': 'In the next chapter we will study how to tune parameters such as '\n",
" 'T, based on the data. 10.6 Bibliographic Remarks As mentioned '\n",
" 'before, boosting stemmed from the theoretical question of whether '\n",
" 'an efficient weak learner can be \"boosted\" into an efficient '\n",
" 'st...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_18_chapter_15_density_based_clustering_dbs_can_TB.txt',\n",
" 'text': 'DENSITY-BASED CLUSTERING 433 15.4 Further Reading Kernel density '\n",
" 'estimation was developed independently in (Rosenblatt; 1956) and '\n",
" '(Parzen, 1962) _ For an excellent description of density '\n",
" 'estimation techniques see (Silverman; 1986) . The density-based '\n",
" 'DBSCA...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_95_vector_calculus_TB.txt',\n",
" 'text': '82 Oyaw J: is the partial derivative obtained by first partial '\n",
" 'differ- entiating with respect to & and then with respect to y_ '\n",
" '82 f Jroy is the partial derivative obtained by first partial '\n",
" 'differentiating by y and then x. Hessian The Hessian is the '\n",
" 'collect...'}\n",
"\n",
"{ 'name': 'PRML_20_appendix_b_probability_distribution.txt',\n",
" 'text': 'For large m, the von Mises distribution is approximately a '\n",
" 'Gaussian centred on 00- Wishart The Wishart distribution is the '\n",
" 'conjugate prior for the precision matrix of a multi- variate '\n",
" 'Gaussian.'}\n",
"\n",
"{ 'name': 'PRML_106_kernel_method.txt',\n",
" 'text': 'Figure 6.4 shows samples of functions drawn from Gaus- sian '\n",
" 'processes for two different choices of kernel function: The first '\n",
" 'of these is a Gaussian kernel of the form (6.23), and the second '\n",
" \"is the exponential kernel given by k(c, = I' ) = exp (-0 |x = %' \"\n",
" '...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_11_model_selection_and_validation_t.txt',\n",
" 'text': 'We assume that for every d, the class Hd enjoys the uniform '\n",
" 'convergence property (see Definition 4.3 in Chapter 4) with a '\n",
" 'sample complexity function of the form UC g(d) log(1/6) mAa (6,6) '\n",
" '(11.1) where g N _ R is some monotonically increasing function. '\n",
" 'For ...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_10_boosting_t.txt',\n",
" 'text': 'b : 0 € R,b e {+1}}. In the following we show that ERMB is a '\n",
" '~-weak learner for H, for = 1/12. 10.1 Weak Learnability 133 To '\n",
" 'see that, we first show that for every distribution that is '\n",
" 'consistent with H, there exists a decision stump with Lp(h) < '\n",
" '1/3. In...'}\n",
"\n",
"{ 'name': 'ESL__1614_unsupervised_learnin.txt',\n",
" 'text': 'Example: Simulations 8 0 FastICA KernelICA ProdDenICA 4 2 8 8 8 1 '\n",
" '8 1 3 3 b d g h k m 0 p Distribution FIGURE 14.42. The left panel '\n",
" 'shows 18 distributions used for comparisons. These include the '\n",
" '\"t\" uniform, exponential, mixtures of exponentials, symmetric...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_7_non_uniform_learn_ability_t.txt',\n",
" 'text': 'Furthermore, the sharp-eyed reader may notice that the \"bad '\n",
" 'learner we introduced in Chapter 2 1 In the literature, '\n",
" 'consistency is often defined using the notion of either '\n",
" 'convergence in probability (corresponding to weak consistency) or '\n",
" 'almost sure conver...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_2520_privacy_preserving_data_mining_TB.txt',\n",
" 'text': 'Two common instantiations of the distance function are as '\n",
" 'follows: 1 Variational distance: This is simply equal to half the '\n",
" 'Manhattan distance between the two distribution vectors: i=1 Ipi '\n",
" '2 qi Dist(P,Q) = (20.13) 2 Kullback-Leibler (KL) distance: This '\n",
" 'is ...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_20_references_TB.txt',\n",
" 'text': 'Matrix Differential Calculus with Appli- cations in Statistics '\n",
" 'and Econometrics Wiley . References 401 Manton, Jonathan H: and '\n",
" 'Amblard, Pierre-Olivier: 2015. A Primer on Reproducing Kernel '\n",
" 'Hilbert Spaces. Foundations and Trends in Signal Processing, '\n",
" '8(1-2)...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_21_index_TB.txt',\n",
" 'text': 'Index 409 inverse element; 36 invertible, 24 Isomap, 136 '\n",
" 'isomorphism, 49 Jacobian, 146, 150 Jacobian determinant; 152 '\n",
" \"Jeffreys-Lindley paradox, 287 Jensen's inequality 239 joint \"\n",
" 'probability 178 Karhunen-Loeve transform; 318 kernel; 33, 47, 58, '\n",
" '254, 388 ker...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_30_compression_bounds_t.txt',\n",
" 'text': '30.3 Bibliographic Remarks Compression schemes and their relation '\n",
" 'to learning were introduced by Little- stone & Warmuth (1986) _ '\n",
" 'As we have shown; if a class has a compression scheme then it is '\n",
" 'learnable For binary classification problems, it follows from...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_20_chapter_17_clustering_validation_TB.txt',\n",
" 'text': 'The table below summarizes the various internal measure values '\n",
" 'for the good and bad clusterings shown in Figure 17.1 and Figure '\n",
" '17.2 lower better BetaCV Cindex DB NC a) Good 0.24 0.034 -0.23 '\n",
" '0.65 2.67 b Bad 0.33 0.08 -0.20 1.11 2.56 higher better Dunn SC '\n",
" '1...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_25_feature_selection_and_generation_t.txt',\n",
" 'text': '25.5 Bibliographic Remarks 371 25.5 Bibliographic Remarks Guyon '\n",
" \"Elisseeff (2003) sur 'veyed several feature selection \"\n",
" 'procedures, including many types of filters_ Forward greedy '\n",
" 'selection procedures for minimizing a convex objective sub- ject '\n",
" 'to a polyhed...'}\n",
"\n",
"{ 'name': 'ESL__1210_boosting_and_additive_tree.txt',\n",
" 'text': '(2000) analyze AdaBoost statistically; derive the exponential '\n",
" 'criterion, and show that it estimates the log-odds of the class '\n",
" 'probability: They propose additive tree models, the right-sized '\n",
" 'trees and ANOVA representation of Section 10.11, and the '\n",
" 'multiclas...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_22_chapter_19_decision_tree_class_i_fier_lda_TB.txt',\n",
" 'text': '19.2.2 Evaluating Split-points All of the split-point evaluation '\n",
" 'measures, like entropy (19.3), Gini-index (19.6) , and CART '\n",
" '(19.7) , considered above depend on the class probability mass '\n",
" 'function (PMF) for D, namely, P(cilD) , and the class PMFs for '\n",
" 'the ...'}\n",
"\n",
"{ 'name': 'hw21_i_ml_21_solution_6_.txt',\n",
" 'text': 'Exercises Introduction to Machine Learning SS 2021 Institute for '\n",
" 'Machine Learning Dept. of Computer Science, ETH Zirich Prof. Dr. '\n",
" 'Andreas Krause Web: https : / /las inf ethz '\n",
" 'ch/teaching/introml-s21 For questions, please refer to Piazza. '\n",
" 'Series 6, May 18th,...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_116_cluster_analysis_TB.txt',\n",
" 'text': ')_ r=I (6.8) This relationship can be shown using the basic '\n",
" 'definition of variance and is used by many clustering algorithms '\n",
" 'such as BIRCH (cf Chap: 7) . Therefore, for each cluster, one '\n",
" 'only needs to maintain these cluster-specific statistics Such '\n",
" 'statist...'}\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rnVR28OXA6OA"
},
"source": [
"## Generator\n",
"\n",
"Similar to previous tutorials, we now initialize our reader/generator.\n",
"\n",
"Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see definition [here](https://huggingface.co/yjernite/bart_eli5))\n",
"\n",
"- usable models for generating answers are listed [here](https://huggingface.co/models?pipeline_tag=text2text-generation)\n",
"- per the [api text](https://haystack.deepset.ai/docs/latest/apigeneratormd) may need to do some additional stuff to get it to work\n",
"- on [generating text in general](https://huggingface.co/blog/how-to-generate)\n",
"- [building custom generators](https://huggingface.co/transformers/main_classes/model.html?transformers.generation_utils.GenerationMixin#transformers.generation_utils.GenerationMixin)\n",
"- [original implementation](https://yjernite.github.io/lfqa.html) of the bart Eli5 model\n",
"- *NOTE all custom generators need to be \"text 2 text\"*\n",
"\n",
"### Model pages & Docs\n",
"\n",
"1. [documentation](https://huggingface.co/transformers/model_doc/t5.html#t5tokenizer) for t5 model/tokenizer (standard)\n",
"2. [pegasus bigpatent](https://huggingface.co/google/pegasus-big_patent) - note this is not the bigbird variant\n",
"\n",
"\n",
"---\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Wmd3HbeHd1JP"
},
"source": [
"#### Custom Converter - How-To\n",
"\n",
"Need to pass in a pre-defined class object to Seq2SeqGenerator as an `input_converter`:\n",
"definition:\n",
"```\n",
"input_converter: an optional Callable to prepare model input for the underlying language model specified in model_name_or_path parameter.\n",
" The required call method signature for the Callable is: \n",
" call(tokenizer: PreTrainedTokenizer, query: str, documents: List[Document], top_k: Optional[int] = None) -> BatchEncoding:\n",
"```\n",
"\n",
"- [source](https://github.com/deepset-ai/haystack/pull/1086)\n",
"\n",
"**links for custom converter help**\n",
"\n",
"1. transformers [generation mixin docs](https://huggingface.co/transformers/main_classes/model.html?transformers.generation_utils.GenerationMixin#transformers.generation_utils.GenerationMixin)\n",
"2. Blog post [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)\n",
"\n",
"3. check the github of haystack and search for `Seq2SeqGenerator` definition to find where the original class is defined (there is one for the default model)\n",
" - a [current link](https://github.com/deepset-ai/haystack/blob/17dcb8c23e2e79391965f84c80eff58522c65c52/haystack/generator/transformers.py) to the file. in `haystack/generator/transformers.py`\n",
"4. t5 prompts can be found in the [original paper](https://arxiv.org/pdf/1910.10683.pdf)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yzj-ZAfOQtVa"
},
"source": [
"> <font color=\"salmon\"> in this next section you define classes for the converter. basically check [huggingface docs](https://huggingface.co/transformers/model_doc/t5.html) for what the tokenizer wants, convert the input documents etc to that. </font>\n",
"\n",
"> ```Model type should be one of BigBirdPegasusConfig, M2M100Config, LEDConfig, BlenderbotSmallConfig, MT5Config, T5Config, PegasusConfig, MarianConfig, MBartConfig, BlenderbotConfig, BartConfig, FSMTConfig, EncoderDecoderConfig, XLMProphetNetConfig, ProphetNetConfig.```"
]
},
{
"cell_type": "code",
"metadata": {
"id": "toVrGrd-Qrgw",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "ac083def-a58f-4ddc-ed4f-13fea97f11a9"
},
"source": [
"from transformers import (\n",
" RagTokenizer,\n",
" RagTokenForGeneration,\n",
" AutoTokenizer,\n",
" AutoModelForSeq2SeqLM,\n",
" PreTrainedTokenizer,\n",
" BatchEncoding,\n",
")\n",
"from haystack import Document\n",
"from typing import Any, Dict, List, Optional\n",
"\n",
"\n",
"class _PegParaPhraseConv:\n",
"    \"\"\"\n",
"    Seq2SeqGenerator input converter used for Pegasus-style QA checkpoints\n",
"    (mapped below to e.g. tuner007/pegasus_qa and google/pegasus-large).\n",
"\n",
"    Takes a query plus the retrieved haystack Documents and formats them into a\n",
"    single 'question: ... context: ...' sequence that a seq2seq model can use as\n",
"    input for its generation step, returning the tokenized BatchEncoding\n",
"    (truncated / padded to at most 512 tokens).\n",
"\n",
"    For background on this converter pattern see Yacine Jernite's LFQA write-up:\n",
"    https://yjernite.github.io/lfqa.html\n",
"    \"\"\"\n",
"\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support documents into a single model input\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            [query_and_docs],\n",
"            truncation=True,\n",
"            padding=\"longest\",\n",
"            max_length=512,\n",
"            return_tensors=\"pt\",\n",
"        )\n",
"\n",
"\n",
"class _pegQA:\n",
"    \"\"\"\n",
"    Input converter for Pegasus QA checkpoints that accept longer inputs.\n",
"    Identical to _PegParaPhraseConv except max_length is 1024 instead of 512.\n",
"    \"\"\"\n",
"\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support documents into a single model input\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            [query_and_docs],\n",
"            truncation=True,\n",
"            padding=\"longest\",\n",
"            max_length=1024,\n",
"            return_tensors=\"pt\",\n",
"        )\n",
"\n",
"\n",
"class _T5asSummary:\n",
"    \"\"\"\n",
"    T5 converter for one-line summarization: prepends the 'summarize: ' task\n",
"    prefix (the T5 prompt convention from the original paper) to the\n",
"    question+context string. No explicit max_length is passed; truncation\n",
"    presumably falls back to the tokenizer's model max length - TODO confirm.\n",
"    \"\"\"\n",
"\n",
"    # use case: t5 one line summary\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support documents into a single model input\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            \"summarize: \" + query_and_docs,\n",
"            truncation=True,\n",
"            padding=\"longest\",\n",
"            return_tensors=\"pt\",\n",
"        )\n",
"\n",
"\n",
"# T5 docs: https://huggingface.co/transformers/model_doc/t5.html\n",
"# (previous link pointed at gpt_neo, which is not one of the seq2seq models used here)\n",
"class _T5asQA:\n",
"    \"\"\"\n",
"    Generic T5 converter: builds 'question: ... context: ...' and tokenizes it.\n",
"    No explicit max_length - truncation presumably uses the tokenizer's model\n",
"    max length; verify per checkpoint.\n",
"    \"\"\"\n",
"\n",
"    # use case - all other instances of T5 in the generator\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support documents into a single model input\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            query_and_docs, truncation=True, padding=\"longest\", return_tensors=\"pt\"\n",
"        )\n",
"\n",
"\n",
"class _LEDasQA:\n",
"    \"\"\"\n",
"    Converter for LED (Longformer Encoder-Decoder, e.g. allenai/led-base-16384).\n",
"    NOTE(review): max_length=512 truncates far below LED's 16k context window -\n",
"    possibly intentional to save memory, but confirm before relying on long docs.\n",
"    \"\"\"\n",
"\n",
"    ## for allen ai longformer\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support document into model\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            query_and_docs,\n",
"            truncation=True,\n",
"            padding=\"longest\",\n",
"            max_length=512,\n",
"            return_tensors=\"pt\",\n",
"        )\n",
"\n",
"\n",
"class _BigBirdforQA:\n",
"    \"\"\"\n",
"    Converter for Google's BigBird-Pegasus checkpoints (bigpatent / arxiv).\n",
"    Same 'question: ... context: ...' format, truncated/padded to 512 tokens.\n",
"    \"\"\"\n",
"\n",
"    # for google's pegasus-bigbird\n",
"    def __call__(\n",
"        self,\n",
"        tokenizer: PreTrainedTokenizer,\n",
"        query: str,\n",
"        documents: List[Document],\n",
"        top_k: Optional[int] = None,\n",
"    ) -> BatchEncoding:\n",
"        # if there are model-specific requirements, add them here\n",
"        conditioned_doc = \" \".join([d.text for d in documents])\n",
"\n",
"        # concatenate question and support document into model\n",
"        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
"\n",
"        return tokenizer(\n",
"            query_and_docs,\n",
"            truncation=True,\n",
"            padding=\"longest\",\n",
"            max_length=512,\n",
"            return_tensors=\"pt\",\n",
"        )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AUmAPeSqKUBh"
},
"source": [
"mapping dict that tells haystack what tokenizer variant to use for a model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3oC62rpBaHYc",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "4ba6c3b9-96f7-43d0-edd1-b987090dc22c"
},
"source": [
"# map model checkpoint name -> input-converter instance; Seq2SeqGenerator uses the\n",
"# converter to turn (query, retrieved documents) into a tokenized batch for that model\n",
"model_converters = {\n",
"    \"ramsrigouthamg/t5_paraphraser\": _T5asQA(),  # works, decent\n",
"    \"tuner007/pegasus_qa\": _PegParaPhraseConv(),  # works, good\n",
"    \"google/t5-v1_1-large\": _T5asQA(),  # works most of the time, mediocre\n",
"    \"allenai/unifiedqa-t5-large\": _T5asQA(),  # works & good\n",
"    \"google/t5-large-ssm\": _T5asQA(),  # crashes, need to investigate\n",
"    \"Salesforce/qaconv-unifiedqa-t5-large\": _T5asQA(),\n",
"    \"allenai/led-base-16384\": _LEDasQA(),  # works & mediocre\n",
"    \"google/bigbird-pegasus-large-bigpatent\": _BigBirdforQA(),  # best\n",
"    \"google/bigbird-pegasus-large-arxiv\": _BigBirdforQA(),\n",
"    \"valhalla/distilt5-qa-qg-hl-12-6\": _T5asQA(),  # works, meh\n",
"    \"akshara23/Pegasus_for_Here\": _pegQA(),  # works well\n",
"    \"tuner007/pegasus_paraphrase\": _pegQA(),\n",
"    \"google/pegasus-reddit_tifu\": _PegParaPhraseConv(),\n",
"    \"google/pegasus-large\": _PegParaPhraseConv(),\n",
"}\n",
"\n",
"# SSM nq might need its own class - input_ids = t5_tok(\"When was Franklin D. Roosevelt born?\", return_tensors=\"pt\").input_ids"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "LaCFopr_H-f0"
},
"source": [
"### Select model\n",
"\n",
"- the model used from huggingface.co as a generator should be defined above\n",
"- the default model `yjernite/bart_eli5` has decent outputs but the usefulness declines the more complicated the question is. \n",
"- **In general, best results are from `akshara23/Pegasus_for_Here`, `google/bigbird-pegasus-large-bigpatent`, and `google/pegasus-reddit_tifu`** (model names as listed in the `model_converters` dict)\n",
"- nq = natural questions. [info on the dataset](https://huggingface.co/datasets/natural_questions)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lK5wWFo6bg8c",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "4f23aaf7-ae02-4219-c221-f74b8e1146a6"
},
"source": [
"# decrease_if_crash (change model type)\n",
"# generator settings: which seq2seq checkpoint to load (must have an entry in\n",
"# model_converters), whether to use it instead of the default yjernite/bart_eli5,\n",
"# and the min/max generated-answer lengths handed to Seq2SeqGenerator below\n",
"cust_model_name = \"tuner007/pegasus_qa\" # @param [\"ramsrigouthamg/t5_paraphraser\", \"google/t5-v1_1-large\", \"allenai/unifiedqa-t5-large\", \"allenai/led-base-16384\", \"akshara23/Pegasus_for_Here\", \"google/pegasus-reddit_tifu\", \"google/bigbird-pegasus-large-bigpatent\", \"tuner007/pegasus_qa\", \"google/pegasus-large\", \"Salesforce/qaconv-unifiedqa-t5-large\"]\n",
"use_custom_model = True # @param {type:\"boolean\"}\n",
"model_min_l = 256 # @param {type:\"integer\"}\n",
"model_max_l = 2048 # @param {type:\"integer\"}\n",
"download_answers_txtfile = False # @param {type:\"boolean\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "B1BmQ6c0Q1vS"
},
"source": [
"### build generator"
]
},
{
"cell_type": "code",
"metadata": {
"id": "bDRf3x5CXbhV",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "2eda472d-2aa9-4fbb-dc89-52c9828d624d"
},
"source": [
"# beam-search width passed to the generator; wider beams explore more candidates\n",
"# but cost more GPU memory - lower this first if the runtime crashes\n",
"number_beam_search = 32 # @param {type:\"integer\"}\n",
"# decrease_if_crash"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fyIuWVwhA6OB",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 177,
"referenced_widgets": [
"8b4925f18b6b4dd3a78b36831bdfd794",
"b1b4d0313c6b4050b265233ca11f85ae",
"7ee55c7f902e4d36934566d2674b44c7",
"50f2ca2a585b42d8804c87178caa1a3c",
"c8d01e6e25384ce2aa88efa410713458",
"a9c43c6632f34a1ea583f8ba287abffb",
"d53bb00fa62a45ff8fc944d2e8fe9f8e",
"11e5674ca6664ea79fc77c535a95b28c",
"5a1040d9b7334915aba96a9ee5601604",
"8edef780b03e42a39233b29f9d1b5f84",
"4dc52a2662ff4dbda5f2aea33bcbf059",
"f574a7ca3c614eebb4a95005541f4222",
"d9ac1c6647474271abc4d9a36e1291ac",
"f7f8bace323d4d6e82d50d905e220b20",
"4f91e1b0b1a243b2b2412470cebbac95",
"3a450da0c4894902b64fe096669b7430",
"55686fe0a21944cfb3b8c8a3d2a8701b",
"32e9704140064e6eaf4177a7f259e2d4",
"7e8a917ce52040fa83dc5c9f55dd47f8",
"4ac96dfb30464045999928f8b8ad84a2",
"fcf19025287544e59597bd34cd00e621",
"326b5113687d4d3b8880fc84392dca42",
"69fecc3080824a98835affb58b00d52b",
"368440c926b34b418208cc5095cdbec8",
"2cdc539883ec402993404c0291168811",
"fd46ef5261a143b7b7f500a3f2aa9698",
"72b2887e929b4fa8b6a7312bf1c88f34",
"ca439e33b100451ea2874ab2e43c6812",
"eb842c1fc52c40b4aabcd888a05ca995",
"3b8bb1707e544af0b01d078ab8845af9",
"9cedd17dc11f44599d8bec369ccd9ae6",
"9579fd27034f4fa2bd3d8ead02a287c4",
"c36a5d0edb8a4cb89b07a9c7f9a64878",
"c61fa952578944b7a6e31b1a85ed9136",
"661d296202464897b8980b715c896445",
"65a71be7d9424158b52ecb8b6c28277a",
"dab18c875b394056abfcd862e1d2fd82",
"ed9814df69bc482692b023164acb0370",
"4a1a6b54dade477a9319c7ee9fc4b3e4",
"d3dc35eeda414597b93333b72ea1ada7",
"569cf4c72fda4667a0d782412fa0065f",
"e2a0d47e4de94d689ce9fb3ea27cbd3b",
"308964c16df440ca85e79170c19ce5be",
"53e23ddcf3e04ead833155c1dcf27e6c",
"250fb8ee150548c396746b55afb9da2c"
]
},
"outputId": "1439dcae-f21e-4ab8-8e67-e3dcc47378d4"
},
"source": [
"gc.collect()\n",
"# special case: google t5-large checkpoints - cap beams at 4 to avoid overload.\n",
"# NOTE(review): this branch ignores the use_custom_model flag, so a google t5-large\n",
"# name is loaded as a custom model even when use_custom_model is False - confirm intended.\n",
"if (\n",
"    \"t5\" in cust_model_name\n",
"    and \"large\" in cust_model_name\n",
"    and \"google\" in cust_model_name\n",
"):\n",
"    print(\"using lower beamsearch count for google model - \", datetime.now(), \"\\n\\n\")\n",
"    generator = Seq2SeqGenerator(\n",
"        model_name_or_path=cust_model_name,\n",
"        input_converter=model_converters.get(cust_model_name),\n",
"        min_length=model_min_l,\n",
"        max_length=model_max_l,\n",
"        num_beams=4,\n",
"    ) # to not overload\n",
"elif use_custom_model:\n",
"    generator = Seq2SeqGenerator(\n",
"        model_name_or_path=cust_model_name,\n",
"        input_converter=model_converters.get(cust_model_name),\n",
"        min_length=model_min_l,\n",
"        max_length=model_max_l,\n",
"        num_beams=number_beam_search,\n",
"    )\n",
"else:\n",
"    # fall back to the default LFQA model (uses haystack's built-in converter)\n",
"    generator = Seq2SeqGenerator(\n",
"        model_name_or_path=\"yjernite/bart_eli5\",\n",
"        min_length=model_min_l,\n",
"        max_length=model_max_l,\n",
"        num_beams=number_beam_search,\n",
"    )\n",
"    # record the fallback name so downstream logging/filenames stay accurate\n",
"    cust_model_name = \"yjernite/bart_eli5\""
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8b4925f18b6b4dd3a78b36831bdfd794",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.14k [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f574a7ca3c614eebb4a95005541f4222",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.91M [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "69fecc3080824a98835affb58b00d52b",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/65.0 [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c61fa952578944b7a6e31b1a85ed9136",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/87.0 [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "250fb8ee150548c396746b55afb9da2c",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/2.28G [00:00<?, ?B/s]"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XqqPuSlhjW0G"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "unhLD18yA6OF"
},
"source": [
"# <center> Question-Answer Pipeline </center>\n",
"\n",
"- Use a Haystack `Pipeline` to build a search pipeline.\n",
"> Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.\n",
"To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.\n",
"You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).\n",
"- **Use case:** \"I want to know a specific detail about a concept or how multiple concepts work together\"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "5A8O9w7PBgvg",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "fe5cc62a-ea75-4252-c3cd-5075c7933811"
},
"source": [
"# tag recorded alongside results to distinguish different question sets\n",
"questions_version = \"setA\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "TssPQyzWA6OG",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "443ba677-dc6d-4a9a-896d-58b8aaac4f49"
},
"source": [
"from haystack.pipeline import GenerativeQAPipeline\n",
"\n",
"# predefined haystack pipeline wiring retriever -> generator (a two-node DAG)\n",
"pipe = GenerativeQAPipeline(generator, retriever)\n",
"\n",
"print(\n",
"    \"generated QA pipeline off of textgen {} - \".format(cust_model_name), datetime.now()\n",
")\n",
"# note: cust_model_name gets overwritten above, in case of \"default\" bart_eli5"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"generated QA pipeline off of textgen tuner007/pegasus_qa - 2021-10-13 00:19:13.897240\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bXlBBxKXA6OL"
},
"source": [
"## Single-Question Queries\n",
"\n",
"- <font color=\"salmon\"> **this is where questions should start to be entered** </font>\n",
"- <font color=\"salmon\"> NOTE that the default model - `yjernite/bart_eli5`- really does not like the word \"**how**\" and will just sound incredulous that you asked it something</font>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "atBL7GaqOc55",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "7151a3f2-f8c9-4ca8-e930-8bb30e1e98b8"
},
"source": [
"# create records dataframe - one row per query is appended as the Q&A cells run\n",
"# NOTE(review): pandas is imported mid-notebook here; conventionally this import\n",
"# belongs in the top import cell (kept so this cell stays self-contained)\n",
"import pandas as pd\n",
"\n",
"info_queries = pd.DataFrame(\n",
"    columns=[\"query\", \"response\", \"query_type\", \"doc_group\", \"model_name\", \"context\"]\n",
")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "F2K4-aeVYOdd",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 529
},
"cellView": "form",
"outputId": "2a0c76b8-ea7a-4bd7-abb7-bbbf2107c989"
},
"source": [
"this_query = \"what is the formal definition of a metric?\" # @param {type:\"string\"}\n",
"topk_search = 50 # @param {type:\"integer\"} # decrease_if_crash\n",
"\n",
"# retrieve topk_search docs, generate one answer\n",
"this_response = pipe.run(\n",
"    query=this_query,\n",
"    params={\"Retriever\": {\"top_k\": topk_search}, \"Generator\": {\"top_k\": 1}},\n",
")\n",
"\n",
"resp_text = clean_output(this_response[\"answers\"])\n",
"new_row = {\n",
"    \"query\": this_query,\n",
"    \"response\": resp_text,\n",
"    \"query_type\": \"QA\",\n",
"    \"doc_group\": course_name,\n",
"    \"model_name\": cust_model_name,\n",
"    \"context\": \"NA\",\n",
"}\n",
"# DataFrame.append is deprecated (removed in pandas 2.0); concat a one-row frame instead\n",
"info_queries = pd.concat([info_queries, pd.DataFrame([new_row])], ignore_index=True)\n",
"print(this_query, \"\\n\\n\")\n",
"pp.pprint(resp_text, indent=4)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n",
"To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at ../aten/src/ATen/native/BinaryOps.cpp:467.)\n",
" return torch.floor_divide(self, other)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"what is the formal definition of a metric? \n",
"\n",
"\n",
"(\"['a metric is used to refer to the set O universe of all en- tities under \"\n",
" 'study. Usually we are interested in certain characteristics Or parameters of '\n",
" 'the entire population (e.g; the mean age of all computer science students in '\n",
" 'the US). Instead, we try to make inferences about the population parameters '\n",
" 'by drawing a random sample from the population, and by computing appropriate '\n",
" 'statis- tics from the sample that give estimates of the corresponding '\n",
" 'population parameters of interest. This is a useful abstraction because it '\n",
" 'helps us conceptualize and structure the field of data mining more '\n",
" 'effectively: The data may have different formats or types. Social networks '\n",
" 'are linked networks of users, whereas information networks are links '\n",
" 'networks of of users and objects: 2 Usage-c centric applications: The user '\n",
" 'activity on the Web is mined to Make inferences. Often; we would like to '\n",
" 'emphasize that some object is a vector and then we use boldface letters ( '\n",
" 'e.g. x and A), sheer, and character thickness. Examples of logical '\n",
" 'representations of Web structure include social and information networks. '\n",
" 'Social Networks are linked Networks of Users, whereas Information Networks '\n",
" 'are Linked networks of Users and Objects: 2Usage-ccentric applications:The '\n",
" 'user activity in the Web has been mined to MAKE inferences. The reader is '\n",
" 'encouraged to skip this section and return to it if during the reading of '\n",
" \"the book some notation is unclear.']\")\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Fw-QiM7_WKcT",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 441
},
"cellView": "form",
"outputId": "d85f3567-82fd-4ab3-dd98-6b46cefb0fd2"
},
"source": [
"this_query = \"Why are distance functions important in machine learning?\" # @param {type:\"string\"}\n",
"topk_search = 50 # @param {type:\"integer\"} # decrease_if_crash\n",
"\n",
"# retrieve topk_search docs, generate one answer\n",
"this_response = pipe.run(\n",
"    query=this_query,\n",
"    params={\"Retriever\": {\"top_k\": topk_search}, \"Generator\": {\"top_k\": 1}},\n",
")\n",
"resp_text = clean_output(this_response[\"answers\"])\n",
"new_row = {\n",
"    \"query\": this_query,\n",
"    \"response\": resp_text,\n",
"    \"query_type\": \"QA\",\n",
"    \"doc_group\": course_name,\n",
"    \"model_name\": cust_model_name,\n",
"    \"context\": \"NA\",\n",
"}\n",
"# DataFrame.append is deprecated (removed in pandas 2.0); concat a one-row frame instead\n",
"info_queries = pd.concat([info_queries, pd.DataFrame([new_row])], ignore_index=True)\n",
"print(this_query, \"\\n\\n\")\n",
"pp.pprint(resp_text, indent=4)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Why are distance functions important in machine learning? \n",
"\n",
"\n",
"(\"['fundamental to the effective design of data mining algorithms, because a \"\n",
" 'poor choice in this respect may be very detrimental to the quality of the '\n",
" 'results. Sometimes, data analysts use the Euclidean function as a \"black '\n",
" 'box\" without much thought about the overall impact of such a choice. This '\n",
" 'chap- ter will, therefore, use either of the terms \"distance function\" and '\n",
" '\"similarity function 75 depending on the domain at hand. For example, a '\n",
" 'distance-based clustering algorithm may group unre- lated data points '\n",
" 'because the distance function may poorly reflect the intrinsic semantic '\n",
" 'distances between data points with increasing dimensionality As a result, '\n",
" 'distance- based models of clustering, classification, and outlier detection '\n",
" 'are often qualitatively ineffective: This phenomenon is referred to as the '\n",
" '\"curse of dimensionality; a term first coined by Richard Bellman. A more '\n",
" 'sophisticated approach; referred toas ISOMAP, uses nonlinear embeddings to '\n",
" 'account for the impact of nonlinear data distributions Local normalization '\n",
" 'can often provide more effective measures when the distribution of the data '\n",
" 'is heterogeneous. Other data types such as categorical data, text, temporal, '\n",
" 'and graph data present further challenges. The determination of time-series '\n",
" 'and discrete-sequence similarity measures is closely related because the '\n",
" 'latter can be considered the categorical version of the former. In some '\n",
" 'domains, such as spatial data, it is more natural to talk about distance '\n",
" \"functions']\")\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Y_t5fJLMiii2"
},
"source": [
"### custom QA function\n",
"\n",
"- same as above, but create a function that iterates through a list of questions and nicely formats them + the output"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-gGWMwW7ikoU",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "1a0a6823-7278-4e27-cc53-e57686057eba"
},
"source": [
"from google.colab import files\n",
"import pprint as pp\n",
"\n",
"\n",
"def answer_questions(\n",
"    das_pipeline,\n",
"    q_list,\n",
"    k_search=50,\n",
"    num_results_d_disp=1,\n",
"    export_txt=True,\n",
"    add_text=\"\",\n",
"    doc_ext=\".md\",\n",
"):\n",
"    \"\"\"Run every question in q_list through the QA pipeline.\n",
"\n",
"    Prints each answer, logs it to the global `info_queries` records frame, and\n",
"    writes a transcript file (pushed to Dropbox, optionally downloaded).\n",
"\n",
"    Args:\n",
"        das_pipeline: haystack pipeline with Retriever + Generator nodes.\n",
"        q_list: list of question strings.\n",
"        k_search: top_k documents for the retriever.\n",
"        num_results_d_disp: top_k answers for the generator.\n",
"        export_txt: if True, also download the transcript via colab `files`.\n",
"        add_text: extra tag inserted into the output filename.\n",
"        doc_ext: transcript extension; \".md\" saves as markdown.\n",
"    \"\"\"\n",
"    gc.collect()\n",
"    qa_document = []\n",
"    global info_queries\n",
"    spacer = \"\\n\\n----------------\\n\"\n",
"    # enumerate() instead of q_list.index(question): index() rescans the list each\n",
"    # iteration and mislabels duplicate questions; start=1 for human-friendly numbering\n",
"    for q_num, question in enumerate(\n",
"        tqdm(q_list, total=len(q_list), desc=\"answering questions...\"), start=1\n",
"    ):\n",
"        print(spacer)\n",
"        question_text = \"Question {} of {}: {}\".format(q_num, len(q_list), question)\n",
"        pp.pprint(question_text)\n",
"        print(\"\\n\")\n",
"        # use the function's own arguments (the original referenced the globals\n",
"        # `pipe` and `topk_search` instead of `das_pipeline` and `k_search`)\n",
"        this_result = das_pipeline.run(\n",
"            query=question,\n",
"            params={\n",
"                \"Retriever\": {\"top_k\": k_search},\n",
"                \"Generator\": {\"top_k\": num_results_d_disp},\n",
"            },\n",
"        )\n",
"\n",
"        this_reply = clean_output(this_result[\"answers\"][0])\n",
"        this_reply = this_reply.replace(\"<n>\", \" \")\n",
"        pp.pprint(this_reply, indent=5)\n",
"        # log for text file\n",
"        qa_document.extend(\n",
"            [spacer, \"### \" + question_text + \"\\n\", \"\\nAnswer:\\n\", this_reply + \"\\n\\n\"]\n",
"        )\n",
"        # log for CSV - DataFrame.append is deprecated (removed in pandas 2.0)\n",
"        n_row_f = {\n",
"            \"query\": question,\n",
"            \"response\": this_reply,\n",
"            \"query_type\": \"QA\",\n",
"            \"doc_group\": course_name,\n",
"            \"model_name\": cust_model_name,\n",
"            \"context\": \"NA\",\n",
"        }\n",
"        info_queries = pd.concat(\n",
"            [info_queries, pd.DataFrame([n_row_f])], ignore_index=True\n",
"        )\n",
"\n",
"    date_time = datetime.now().strftime(\"%m.%d.%Y_%H-%M\")\n",
"    this_outname = (\n",
"        remove_string_extras(\n",
"            \"Q&A_{}_exp_{}\".format(course_name, add_text)\n",
"            + \"_\"\n",
"            + cust_model_name\n",
"            + \"_\"\n",
"            + date_time\n",
"            + \"_\"\n",
"        )\n",
"        + doc_ext\n",
"    )\n",
"    with open(this_outname, \"w\", encoding=\"utf-8\", errors=\"ignore\") as qa_f:\n",
"        qa_f.writelines(qa_document)\n",
"    put_in_dropbox(this_outname)\n",
"    if export_txt:\n",
"        files.download(this_outname)\n",
"\n",
"    print(\"Completed QA - \", date_time)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F3m7lffOkQZu"
},
"source": [
"## List of Questions\n",
"\n",
"- *questions can be added / removed, leave it as a list*"
]
},
{
"cell_type": "code",
"metadata": {
"id": "BC1IykFQkL-H",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "675a024c-4e61-42a4-cf9a-a1ef8ee5082a"
},
"source": [
"# questions fed one-by-one to answer_questions(); typos fixed so the retriever and\n",
"# generator see clean query text ('distadvantages' -> 'disadvantages', 'distannce' -> 'distance')\n",
"QUESTIONS = [\n",
"    'how is the manhattan distance function defined? what are the primary advantages and disadvantages of the manhattan distance function?',\n",
"    'how is the hamming distance function defined? what are the primary advantages and disadvantages of the hamming distance function?',\n",
"    'how is the euclidean distance function defined? what are the primary advantages and disadvantages of the euclidean distance function?',\n",
"    'how is the chebyshev distance function defined? what are the primary advantages and disadvantages of the chebyshev distance function?',\n",
"    'how is the minkowski distance function defined? what are the primary advantages and disadvantages of the minkowski distance function?',\n",
"    'between the manhattan, minkowski, euclidean, chebyshev, and hamming distance functions, do all distance functions report a lower average distance when comparing items of the same group vs. items from different groups? Which distance functions would result in a higher average distance when comparing items of the same group vs. items from different groups?',\n",
"    'between the manhattan, minkowski, euclidean, chebyshev, and hamming distance functions, which metric would provide, on average, the best separation between groups? Why does it provide the best separation?',\n",
"    # TODO: replace the <insert answer from question above> placeholder by hand before running\n",
"    'between the manhattan, minkowski, euclidean, chebyshev, and hamming distance functions, why does the <insert answer from question above> metric provide, on average, the best separation between groups?',\n",
"    # TODO: this question is truncated mid-sentence - finish it before running\n",
"    'what is the range of possible values that a distance function defined by the',\n",
"    'The Manhattan and Euclidean distances are also known as L1 and L2 norms, respectively. In general, what behaviour can we expect about the L1 vs. L2 norms as the dimensionality (i.e. the number of attributes) in the data increases? How would this appear in a dataset?',\n",
"    'what requirements need to be fulfilled for a function to be a metric?',\n",
"    'what are the typical properties of a metric function?',\n",
"    'what are some common counterexamples that prove that a function is not a metric?',\n",
"    'what is the function property called homogeneity? What is the process to prove whether homogeneity applies to a given function?',\n",
"    'what is the function property called translation invariance? What is the process to prove whether translation invariance applies to a given function?',\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "whbQFC5xDQA8"
},
"source": [
"## Run list for Question-Answer\n",
"\n",
"- depending on database size, number of questions, and so on, you may need to run the QA list and the search term list separately"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-jawHqKh8Wms",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"cellView": "form",
"outputId": "665afe2b-bb6c-468a-dd3c-7bd6e3d44048"
},
"source": [
"process_QA_list = True # @param {type:\"boolean\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"id": "477RXydvigC-",
"outputId": "9920d695-4ae4-477d-e5c9-76afe53f3906"
},
"source": [
"process_QA_list = True"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RkyOXXavkMpx",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"724812ffa9704cb98bef6dd2cc1832cf",
"5f497ad6fcb540db8ceac9b072fe4782",
"ec25e2f20b7b4693a38ab1ce762ee4ea",
"bacf27bfc94d47f8821e8c790c877852",
"0d6bbe7f480e46778a1dacf8a110f041",
"b673b94f774743ef9b18773bb68b4718",
"c3d626747bb54178a86537b8b6fcf807",
"345a35e102c144188b64ff73fe597e72",
"5ca8dd47bc4344e8b9251327b914e4d3",
"faa7eab29349446f8b8f29022c3af7dc",
"e5e84104a3a54cfa9be4c21c58cdb3df"
]
},
"outputId": "6a0e54aa-9e84-4ce3-9610-d152b4ce7c59"
},
"source": [
"if process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" QUESTIONS,\n",
" add_text=\"main_{}_\".format(questions_version),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "724812ffa9704cb98bef6dd2cc1832cf",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"answering questions...: 0%| | 0/15 [00:00<?, ?it/s]"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n",
"----------------\n",
"\n",
"('Question 0 of 15: how is the manhattan distance function defined? what are '\n",
" 'the primary advantages and distadvantages of the manhattan distance '\n",
" 'function?')\n",
"\n",
"\n",
"('distance function to the Manhattan distance and use the representative as '\n",
" 'the local median of the cluster (independently along each dimension). '\n",
" 'Similarity and distance functions are often expressed in closed form (e.g:, '\n",
" 'Euclidean distance), but in some domains, such as time-series data, they are '\n",
" 'defined algorithmically and cannot be expressed inclosed form Distance '\n",
" 'functions are fundamental to the effective design of data mining algorithms, '\n",
" 'because a poor choice in this respect may be very detrimental to the quality '\n",
" 'of the results. Sometimes, data analysts use the Euclidean function as a '\n",
" '\"black box\" without much thought about the overall impact of the overall '\n",
" 'design of Data mining algorithms can be very beneficial to the effectiveness '\n",
" 'of the design of a data mining algorithm, because the selection of the '\n",
" 'distance function will (typically) not belong to the original data set D. In '\n",
" 'this case, the only difference between the generic pseu - docode of Fig: '\n",
" '6.2, and a k-medians variation would be to instantiate the distance Function '\n",
" 'to theManhattan distance and Use the Representative as the Local Median of '\n",
" 'the Cluster (Independently alongeach dimension) A k- medians variation will '\n",
" 'be toInstantiate the Distance Function tothe Manhattan Distance and '\n",
" 'usetheRepresentative as thelocal median ofthe cluster')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 1 of 15: how is the hamming distance function defined? what are the '\n",
" 'primary advantages and distadvantages of the hamming distance function?')\n",
"\n",
"\n",
"('the number of mismatched values ou (xi,xj) = d-8 = 1 2 6(Xi,- X;)? Hamming '\n",
" 'distance is thus equivalent to half the squared Euclidean distance. This is '\n",
" 'because many data mining algorithms use the distance function as a key '\n",
" 'subroutine, and the design of the function directly impacts the quality of '\n",
" 'the results: Distance functions are highly sensitive to the type of thedata, '\n",
" 'the dimensionality of the data, and The Lp-norm is the most common distance '\n",
" 'function used for multidimensional data. 3.2.1.9 Computational '\n",
" 'Considerations A major consideration in thedesign of distance functions is '\n",
" 'the computational complexity: this is because distance function computation '\n",
" 'is often embedded as a subroutine that is used repeatedly in the application '\n",
" 'at hand: If the subroutine is not efficiently implementable the '\n",
" 'applicability becomes more Computational complexity is a major consideration '\n",
" 'for distance functions, and this is due to the fact that distance functions '\n",
" 'are very sensitive to different types of data, such as time-series data, '\n",
" 'they are defined algorithmically and cannot be expressed in closed '\n",
" 'formDistance functions are fundamental to the effective design of datamining '\n",
" 'algorithms, because a poor choice in this respect may be very detrimental to '\n",
" 'the quality the results. CATEGORICAL ATTRIBUTES 99 Euclidean Distance The '\n",
" 'Euclideandistance between Xi and Xj is given as T 6(xi, Xj)= Ilxi 5 Xj ll Xi '\n",
" 'Xi 2xiXj +xjXj 2d S')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 2 of 15: how is the euclidean distance function defined? what are '\n",
" 'the primary advantages and distadvantages of the euclidean distance '\n",
" 'function?')\n",
"\n",
"\n",
"('in some domains, such as time-series data, they are defined algorithmically '\n",
" 'and cannot be expressed in closed form Distance functions are fundamental to '\n",
" 'the effective design of data mining algorithms, because a poor choice in '\n",
" 'this respect may be very detrimental to the quality of the results. '\n",
" 'Conversely; a pair of objects are unlikely to have similar values across '\n",
" 'many attributes, just by chance, unless these attributes were relevant _ '\n",
" 'Interestingly, the Euclidean metric (and Lp-norm in general) achieves '\n",
" 'exactly the opposite effect by using the squared sum of the difference in '\n",
" 'attribute values: As a result; the \"noise\" components from the irrelevant '\n",
" 'attributes dominate the computation and mask the similarity effects of a '\n",
" 'large number of relevant attributes. The Lo-norm provides an extreme example '\n",
" 'of this effect where the dimension with the largest distance value is used. '\n",
" '3.2.1.9 Considerations A major consideration in the design of distance '\n",
" 'functions is the computational complexity: This is because distance function '\n",
" 'computation is often embedded as a subroutine that is used repeatedly in the '\n",
" 'application at hand: If the subroutine is not efficiently implementable the '\n",
" 'applicability becomes more restricted: For example, methods such as ISOMAP '\n",
" 'are computationally expensive and hard to implement for very large data sets '\n",
" 'because these methods scale with at least the square of the data size. The '\n",
" 'generic framework for representative based algorithms with an unspecified '\n",
" 'distance function is illustrated in the pseudocode of Computational Fig: '\n",
" '6.2_')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 3 of 15: how is the chebyshev distance function defined? what are '\n",
" 'the primary advantages and distadvantages of the chebyshev distance '\n",
" 'function?')\n",
"\n",
"\n",
"('in some domains, such as time-series data, they are defined algorithmically '\n",
" 'and cannot be expressed in closed form Distance functions are fundamental to '\n",
" 'the effective design of data mining algorithms, because a poor choice in '\n",
" 'this respect may be very detrimental to the quality of the results. 3.2.1.1 '\n",
" 'Impact of Domain-Specific Relevance In some cases, an analyst may know which '\n",
" 'features are more important than others for a particular application: For '\n",
" 'example, for a credit-scoring application; an attributesuch as salary is '\n",
" 'much more relevant to the design of the distance function than an attribute '\n",
" 'such as gender, though both may have some impact. This metric is a '\n",
" 'generalization of the Euclidean measure, and stretches the distance values '\n",
" 'along the principal components according to their variance. A more '\n",
" 'sophisticated approach; referred to as ISOMAP, uses nonlinear embeddings to '\n",
" 'account for the impact of nonlinear data distributions Local normalization '\n",
" 'can often provide more effective measures when the distribution of the data '\n",
" 'is heterogeneous. Conversely; a pair of objects are unlikely to have similar '\n",
" 'values across many attributes, just by chance, unless these attributes were '\n",
" 'relevant _ Interestingly,the Euclidean metric (and Lp-norm in general) '\n",
" 'achieves exactly the opposite effect by using the squared sum of the '\n",
" 'difference in attribute values: As a result; the \"noise\" components from the '\n",
" 'irrelevant attributes dominate the computation and mask the similarity '\n",
" 'effects of a large number of relevant attributes.')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 4 of 15: how is the minkowski distance function defined? what are '\n",
" 'the primary advantages and distadvantages of the minkowski distance '\n",
" 'function?')\n",
"\n",
"\n",
"('the Euclidean metric (and Lp-norm in general) achieves exactly the opposite '\n",
" 'effect by using the squared sum of the difference in attribute values: As a '\n",
" 'result; the \"noise\" components from the irrelevant attributes dominate the '\n",
" 'computation and mask the similarity effects of a large number of relevant '\n",
" 'attributes. For example, methods such as ISOMAP are computationally '\n",
" 'expensive and hard to implement for very large data sets because these '\n",
" 'methods scale with at least the square of the data size. Conversely; a pair '\n",
" 'of objects are unlikely to have similar values across many attributes, just '\n",
" 'by chance, unless these attributes were relevant _ Interestingly, the '\n",
" 'Euclidea metric ( and Lp -norm in General) achievesexactly theopposite '\n",
" 'effect by use of the squared Sum of thedifference in attributes values:As a '\n",
" 'result, the \" noise\" componentsfrom the irrelevant Attributes dominatethe '\n",
" 'computation and Mask the similarity Effects of a Large Number of relevant '\n",
" 'Attributes. For Example, methodssuch as ISO MAP are computationally '\n",
" 'Expensive and Hard to Implement for Very Large Data Sets because these '\n",
" 'Methods scale with At least the Square of the Data Size. Conversely, a Pair '\n",
" 'of Objects are likely to have Similar Values Across Many Attributes, just By '\n",
" 'Chance, Unless these Attributes were Relevant _ Ironically, the Lp -norm '\n",
" 'ingeneral is suscep- tible to some undesirable effects of increasing '\n",
" 'dimensionality')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 5 of 15: between the manhattan, minkowski, euclidean, chebyshev, '\n",
" 'and hamming distance functions, do all distance functions report a lower '\n",
" 'average distance when comparing items of the same group vs. items from '\n",
" 'different groups? Which distannce functions would result in a higher average '\n",
" 'distance when comparing items of the same group vs. items from different '\n",
" 'groups?')\n",
"\n",
"\n",
"('manhattan, minkowski, euclidean, chebyshev, and hamming distance functions, '\n",
" 'do all distance functions report a lower average distance when comparing '\n",
" 'items of the same group vs. items from different groups? Which distannce '\n",
" 'functions would result in a higher average distance When comparingItems of '\n",
" 'the Same Group vs.Items from Different Groups? Which Distannce Functions '\n",
" 'Would Result in a Higher Average Distance When Comparing Items of the SAME '\n",
" 'GROUP vs. Items from Different groups? It is evident that the rate of '\n",
" 'degradation with increasing p is higher when the dimensionality of the data '\n",
" 'is large. This is the reason that the value of p matters less in lower '\n",
" 'dimensional applications. 68 CHAPTER 3 SIMILARITY AND DISTANCES This '\n",
" 'argument has been used to propose the concept of fractional metrics, for '\n",
" 'which p (0,1). M m=1 (10.43) Due to the stabilizing effect of averaging, '\n",
" 'this measure turns out to be more reliable than is its counterpart (10.42) '\n",
" 'for a single tree. Conversely; a pair of objects are unlikely to have '\n",
" 'similar values across many attributes, just by chance, unless these '\n",
" 'attributes were relevant _ Interestingly, the Euclidean metric (and Lp-norm '\n",
" 'in general) achieves exactly the opposite effect by using the squared sum of '\n",
" 'the difference in attribute values')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 6 of 15: between the manhattan, minkowski, euclidean, chebyshev, '\n",
" 'and hamming distance functions, which metric would provide, on average, the '\n",
" 'best separation between groups? Why does it provide the best separation?')\n",
"\n",
"\n",
"('The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. Figure 3.2b is derived '\n",
" 'from Fig: 3.2a, except that the results show the fraction of the Manhattan '\n",
" 'performance achieved by higher order norms It is evident that the rate of '\n",
" 'degradation with increasing p is higher when the dimensionality of the data '\n",
" 'is large. The smaller the DB value the better the clustering; since it means '\n",
" 'that the clusters are well separated (i.e,, the distance between cluster '\n",
" 'means is large), and each cluster is well represented by its mean ( i.e.= '\n",
" 'has a small spread) _ Silhouette Coefficient The silhouette coefficient is a '\n",
" 'measure of both cohesion and separation of clusters, and is based on the '\n",
" 'difference between the average distance to points in the closest cluster and '\n",
" 'topoints in the same cluster. This section will therefore study each of '\n",
" 'these types separately: 3.2.1 Quantitative Data')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 7 of 15: between the manhattan, minkowski, euclidean, chebyshev, '\n",
" 'and hamming distance functions, why does the <insert answer from question '\n",
" 'above> metric provide, on average, the best separation between groups?')\n",
"\n",
"\n",
"('Because of the balanced nature of the data, all initial splits on %1 O1 82 '\n",
" 'appear to be useless; and the procedure essentially gener- ates & random '\n",
" 'split at the top level: The actual split found for these data is shown in '\n",
" 'the left panel of Figure 8.13. The smaller the DB value the better the '\n",
" 'clustering; since it means that the clusters are well separated (i.e,, the '\n",
" 'distance between cluster means is large), and each cluster is well '\n",
" 'represented by its mean ( i.e.= has a small spread) _ Silhouette Coefficient '\n",
" 'The silhouette coefficient is a measure of both cohesion and separation of '\n",
" 'clusters, and is based on the difference between the average distance to '\n",
" 'points in the closest cluster and to pointsin the same cluster. Another '\n",
" 'method for detecting the incorrect entries is to use domain-specific '\n",
" 'knowledge about what is already known about the data: For example, if a '\n",
" \"person's height is listed as 6 m, the silhouette coefficient can be used to \"\n",
" 'find the best split on either feature, and then splits the resulting strata: '\n",
" 'The larger p, the more large deviations in one dimension matter. For p - 0, '\n",
" 'the Minkowski converges to the Chebyshev distance For p Z 1,the Minkowski '\n",
" 'distance is a metric.')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 8 of 15: what is the range of possible values that a distance '\n",
" 'function defined by the')\n",
"\n",
"\n",
"('the range of possible values that a distance function defined by the '\n",
" 'context: The quantiles of the Gaussian distribu- tion are used to determine '\n",
" 'the boundaries of the intervals. The values are discretized into a small '\n",
" 'number (typically 3 to 10) of intervals for the best results. Each such '\n",
" 'equi-depth inter- val is mapped to a symbolic value. This chap- ter will, '\n",
" 'therefore, use either of the terms \"distance function\" and \"similarity '\n",
" 'function 75 depending on the domain at hand. In high-dimensional domains '\n",
" 'such as text, similarity functions such as the cosine measure (discussed in '\n",
" 'Sect. 3.3), tend to emphasize the cumulative effect of matches 0n many '\n",
" 'attribute values rather than large distances along individual attributes. '\n",
" 'Similarity and distance functions are often expressed in closed form (e.g:, '\n",
" 'Euclidean distance), but in some domains, such as time-series data, they are '\n",
" 'defined algorithmically and cannot be expressed inclosed form Distance '\n",
" 'functions are fundamental to the effective design of data mining algorithms, '\n",
" 'because a poor choice of algorithm is not the only factor in determining the '\n",
" 'proper value of p. One way of de-emphasizing precise levels of dissimilarity '\n",
" 'is to use prorimity thresh- olding in a dimensionality-sensitive way')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 9 of 15: The Manhattan and Euclidean distances are also known as L1 '\n",
" 'and L2 norms, respectively. In general, what behaviour can we expect about '\n",
" 'the L1 vs. L2 norms as the dimensionality (i.e. the number of attributes) in '\n",
" 'the data increases? How would this appear in a dataset?')\n",
"\n",
"\n",
"('Lp-norm in general) achieves exactly the opposite effect by using the '\n",
" 'squared sum of the difference in attribute values: As a result; the \"noise\" '\n",
" 'components from the irrelevant attributes dominate the computation and mask '\n",
" 'the similarity effects of a large number of relevant attributes. The Lo-norm '\n",
" 'provides an extreme example of this effect where the dimension with the '\n",
" 'largest distance value is used. For in- stance, at level 2 the inequality is '\n",
" 'Z, which implies that if Y is any itemset at this level, we will obtain a '\n",
" 'lower bound. The signs at different levels indicate the coef- ficient of the '\n",
" 'corresponding itemset in the upper Or lower bound computations via (9.3) and '\n",
" '(9.4). 3.5.1 Similarity between Two Nodes in a Single Graph Let G = (N, A) '\n",
" 'be an undirected network with node set N and edge set A. In some domains, '\n",
" 'costs are associated with nodes, whereas in others, weights areassociated '\n",
" 'with nodes. For example, in domains such as bibliographic networks, the '\n",
" 'edges are naturally weighted; and in road networks; the edges naturally have '\n",
" \"costs. Therefore; it may be assumed that either the cost Cij' or the weight \"\n",
" 'Wij of the edge (i,j) is specified. 3.2.1.6 Impact of Data Distribution')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 10 of 15: what requirements need to be fulfilled for a function to '\n",
" 'be a metric?')\n",
"\n",
"\n",
"('minimum support requirement are often referred to as frequent patterns, or '\n",
" 'frequent itersets. Error measures for ordinal variables are generally '\n",
" 'defined by replacing their M original values with 2 =1/2 1 = 1, M (14.23) M '\n",
" 'in the prescribed order of their original values. With unordered categorical '\n",
" '(also called nominal) variables, the degree-of-difference between pairs of '\n",
" 'values must be delineated explicitly. For example, a request might be the '\n",
" 'following: Display all transactions in which ice skates are the consequent '\n",
" 'that have confidence over 80% and gupport of more than 2%. Focusing on a '\n",
" 'particular consequent casts the problem into the framework of supervised '\n",
" 'learning: Association rules have become a popular tool for analyzing very '\n",
" 'large commercial data bases in settings where market basket is relevant. '\n",
" 'These measures often lead to generation of more interesting rules from a '\n",
" 'statistical perspective. More specifically, one might request such a list '\n",
" 'conditioned on particular items in the antecedent or especially the '\n",
" 'consequent. In most cases, the class labels have a clear semantic '\n",
" 'interpretation in the context of a specific application, such as a group of '\n",
" 'customers interested in a specific product, or agroup of data objects with a '\n",
" 'desired property of interest. This could provide information on those items '\n",
" '(antecedent_ that predicate sales of ice skates. Focusingon a '\n",
" 'particularnywayanyday')\n",
"\n",
"\n",
"----------------\n",
"\n",
"'Question 11 of 15: what are the typical properties of a metric function?'\n",
"\n",
"\n",
"('a pair of objects are unlikely to have similar values across many '\n",
" 'attributes, just by chance, unless these attributes were relevant _ '\n",
" 'Interestingly, the Euclidean metric (and Lp-norm in general) achieves '\n",
" 'exactly the opposite effect by using the squared sum of the difference in '\n",
" 'attribute values: As a result; the \"noise\" components from the irrelevant '\n",
" 'attributes dominate the bibliographic and mask the similarity effects of a '\n",
" 'large number of relevant attributes. The Lo-norm provides an extreme example '\n",
" 'of this effect where the computation with the largest distance value is '\n",
" 'used. In some cases, the contextual attribute might be a logical location, '\n",
" 'such as a building or a state_ In the case of spatiotemporal data; the '\n",
" 'contextual attributes may include time. Calculus of Variations We can think '\n",
" 'of a function y(w) as being anOperator that; for any input value %, returns '\n",
" 'an output value y: In the same way we can define a functional Fly] to be an '\n",
" 'operator that takes a Function y(x) and returns anoutput value F An example '\n",
" 'of a functional is the length of a curve drawn in a tWo-dimensional plane in '\n",
" 'which the path of the curve is defined in terms of a Function:In the context '\n",
" 'of machine learning; a widely used Functional is the entropy H[x] for a '\n",
" 'continuous variable x because, for any choice of probability density '\n",
" 'function p(x), it returns a scalar value representing the entropy of x under '\n",
" 'that')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 12 of 15: what are some common counterexamples that prove that a '\n",
" 'function is not a metric?')\n",
"\n",
"\n",
"('Reproducibility: If our state of knowledge about two problems are the same, '\n",
" 'then we must assign the same degree of plausibility to both of them: The '\n",
" 'Cox-Jaynes theorem proves these plausibilities to be sufficient to define '\n",
" 'the universal mathematical rules that apply to plausible p, up to '\n",
" 'transformation by an arbitrary monotonic function. Crucially these rules are '\n",
" 'the rules of probability Remark In machine learning and statistics, there '\n",
" 'are two major interpre- tations of probability: the Bayesian and frequentist '\n",
" 'interpretations (Bishop, 2006, Efron and Hastie, 2016). These notions are '\n",
" 'also closely related to the notion of regularization (e.g: Tikhonov (1943))_ '\n",
" 'We will elaborate on regularization in the second part of this book: The '\n",
" 'notion of consistency of estimators dates back to Fisher (1922). Our pre '\n",
" 'sentation of consistency follows Steinwart & Christmann, who also derived '\n",
" \"several no-free-lunch theorems. H' contains pairs of hypotheses that agree \"\n",
" \"on C' and differ 0n C1= Using this definition, it is clear that if H' \"\n",
" \"shatters a set B C' then it also shatters the set BUc1 and vice versa. For \"\n",
" 'example, a distance-based clustering algorithm may group unre- lated data '\n",
" 'points because the distance function may poorly reflect the intrinsic '\n",
" 'semantic distances between data points with increasing dimensionality')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 13 of 15: what is the function property called homogeneity? What is '\n",
" 'the process to prove whether homogeneity applies to a given function?')\n",
"\n",
"\n",
"('A key property of (-diversity is that any generalization of an 0-diverse '\n",
" 'table is also Q-diverse. Then, the equivalence class is said to satisfy '\n",
" 't-closeness, if the following is true, for an appropriately chosen distance '\n",
" 'function Distlh ): Dist(P,Q) t 20.12 An anonymized table is said that all '\n",
" 'equivalence classes in it satisfies t.closeness. The previous definition '\n",
" 'does not specify any particular distance function: There are many different '\n",
" 'ways to instantiate the distance function, depending on application-specific '\n",
" 'goals. Similarity and distance functions are often expressed in closed form '\n",
" '(e.g:, Euclidean distance), but in some domains, such as time-series data, '\n",
" 'they are defined algorithmically and cannot be expressed inclosed form '\n",
" 'Distance functions are fundamental to the effective design of data mining '\n",
" 'algorithms, because a poor choice in this respect may be very detrimental to '\n",
" 'the quality of the results. Sometimes, data analysts use the Euclidean '\n",
" 'function as a \"black box\" without much thought about the overall impact of '\n",
" 'such a choice. Therefore, evolutionary algorithms are set up to repeat the '\n",
" 'process of selection; crossover, and mutation to improve the fitness '\n",
" '(objective) function value. As theprocess of evolution progresses, all the '\n",
" 'individuals in the population typically improve in fitness and also become '\n",
" 'more similar to each other')\n",
"\n",
"\n",
"----------------\n",
"\n",
"('Question 14 of 15: what is the function property called translation '\n",
" 'invariance? What is the process to prove whether translation invariance '\n",
" 'applies to a given function?')\n",
"\n",
"\n",
"('a number of other dynamic programming methods are commonly used: 3.4.2.1 '\n",
" 'Edit Distance The edit distance defines the distance between two strings as '\n",
" 'the least amount of \"effort\" (o1 cost) required to transform one sequence '\n",
" 'into another by using a series of transformation operations, referred to as '\n",
" '\"edits.\" the edit distance is also referred to to as the Levenshtein '\n",
" 'distance. Automatic differentiation applies aseries of elementary arithmetic '\n",
" 'Automatic operations, e.g.,, addition and multiplication and elementary '\n",
" 'functions, differentiation is different from e. g., sin, COS, exp, log: By '\n",
" 'applying the chain rule to these operations, the symbolic gradient of quite '\n",
" 'complicated functions can be computed automatically differentiation '\n",
" 'andAutomatic differentiation applies to general nonlinear dataDistribution '\n",
" 'can often provide more effective measures when the distribution of the data '\n",
" 'is heterogeneous. In such a case, the edit function is symmetric because it '\n",
" 'does not matter which of the two strings is edited to the other ; a reverse '\n",
" 'sequence of edits, with the same cost, will exist from the other string to '\n",
" 'the first.The edit distance can be extended to numeric data by changing the '\n",
" 'primitive operations of insert, delete, and replace to transformation rules '\n",
" 'that are designed for time series. This metric is a generalization of the '\n",
" 'Euclidean measure, and stretches the distance values along the principal '\n",
" 'components according to their variance')\n",
"Completed QA - 10.13.2021_00-23\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "e46AMZxiVIWr"
},
"source": [
"### Question Set 2"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "lmXYH65XuyOE",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d1174b08-e186-4763-96ad-6bbae859e3ef"
},
"source": [
"qset_key_term = \"linear reg\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bd9bfNamgBJV",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3a09eb5c-c742-4786-9375-c13e7f10eda2"
},
"source": [
"question_set_2 = [\n",
"\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "E4WgoL7WgDl8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "de4bf9d5-4ac5-4095-cc3d-f21b0ebaf0d5"
},
"source": [
"if len(question_set_2) > 0 and process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" question_set_2,\n",
" add_text=\"addtl_{}_[topic={}]\".format(questions_version, qset_key_term),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "X4Y4XecAvjhF"
},
"source": [
"### Question Set 3"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "crA947z7vjhG",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "7746dcb9-5337-4db8-ab20-ede57d520fa6"
},
"source": [
"qset_key_term = \"classification\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "TSaiF3KpvjhG",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2ba05702-f352-43f9-fa81-9872a6412c06"
},
"source": [
"question_set_3 = [\n",
"\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "CsJ-CsFevjhH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5023a305-cfba-4b54-9bd6-d5206feb3789"
},
"source": [
"if len(question_set_3) > 0 and process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" question_set_3,\n",
" add_text=\"addtl_{}_[topic={}]\".format(questions_version, qset_key_term),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "27J0WZo9vpY1"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Tib_vY_9vpYz"
},
"source": [
"### Question Set 4"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "osF53wLCvpY0",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "50eadcb1-d0c9-4a90-807c-b6c85d41e352"
},
"source": [
"qset_key_term = \"unsupervised\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uSADAjGEvpY0",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b4005c88-cd2c-42dd-d4dc-fb394ce40e65"
},
"source": [
"question_set_4 = [\n",
" \n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "H8L3AaIwvpY0",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d92de17e-c88b-4a86-af29-d56ee859fc25"
},
"source": [
"if len(question_set_4) > 0 and process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" question_set_4,\n",
" add_text=\"addtl_{}_[topic={}]\".format(questions_version, qset_key_term),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iZSEr6edwHf7"
},
"source": [
"### Question Set 5"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "OoyNTFHrwHf8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "85abd6c1-a78e-48c2-967f-1e82f79fc098"
},
"source": [
"qset_key_term = \"neural networks\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "3UU6gg-BwHf8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "347e03d1-8687-4734-a164-7c6b819ccd68"
},
"source": [
"question_set_5 = [\n",
" \n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "F7HEoE00wHf8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "31303416-74be-4885-d8d8-5acc847018fc"
},
"source": [
"if len(question_set_5) > 0 and process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" question_set_5,\n",
" add_text=\"addtl_{}_[topic={}]\".format(questions_version, qset_key_term),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "W-UbuGA0wMIn"
},
"source": [
"### Question Set 6"
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "AjwK8wqVwMIo",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "a872472c-8ed1-4851-8918-07c74bcad12f"
},
"source": [
"qset_key_term = \"add stuff\" # @param {type:\"string\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XlK4e27RwMIo",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "a3404961-f78e-4b91-c1e8-7b58bde5bd5d"
},
"source": [
"question_set_6 = [\n",
"\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ejUxlK7NwMIo",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "9e695f9e-c1ad-431f-82aa-55da3e50e953"
},
"source": [
"if len(question_set_6) > 0 and process_QA_list:\n",
" answer_questions(\n",
" pipe,\n",
" question_set_6,\n",
" add_text=\"addtl_{}_[topic={}]\".format(questions_version, qset_key_term),\n",
" k_search=100,\n",
" doc_ext=\".md\",\n",
" export_txt=download_answers_txtfile,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OliWlJbQDzmq"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "99rbio1E3KE_"
},
"source": [
"# <center> Relevant Document Search </center>\n",
"\n",
        "- enter a topic, and the retriever will return the top N results to check out for that topic\n",
        "- uses `DocumentSearchPipeline`, so it searches the texts in the corpus but only performs *extractive* summarization\n",
"- **Use case:** \"I want to know where a concept or term shows up in the course documents\"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "QUYHxaX6oHSz",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "45f4a312-ee7c-4e80-9cbe-904dcb430d8c"
},
"source": [
"from haystack.pipeline import DocumentSearchPipeline\n",
"\n",
"# del retriever\n",
"# re-using from earlier\n",
"# retriever_search = retriever\n",
"\n",
"search_pipe = DocumentSearchPipeline(retriever)\n",
"\n",
"print(\"the URL containing text documents in search is: \\n\", URL_to_archive)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"the URL containing text documents in search is: \n",
" https://www.dropbox.com/sh/xioggpj5h2lqrt9/AAD865bEus7OaI-Q87I8muS1a?dl=1\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E282IyIj_yCD"
},
"source": [
        "## Single query\n",
        "\n",
        "**Note:** the cell below uses the `print_documents()` function for display only; a dictionary with a lot of other metadata is also returned (*if you want to save or customize the response, just assign the function output to a variable*)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "nVdzO2gi_zsR",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"cellView": "form",
"outputId": "dabeda14-f0e5-405d-b064-c9983937a1dd"
},
"source": [
"rd_query = \"distance metric\" # @param {type:\"string\"}\n",
"num_results = 3 # @param {type:\"integer\"}\n",
"search_result = search_pipe.run(query=rd_query, \n",
" params={\"retriever\": {\"top_k\": num_results}},)\n",
"print_documents(search_result, max_text_len=512)\n",
"\n",
"# print(search_result)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Query: distance metric\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The Manhattan distance is the \"city block\" driving distance in a '\n",
" 'region in which the streets are arranged as a rectangular grid, '\n",
" 'such as the Manhattan Island of New York City: A nice property of '\n",
" 'the Euclidean distance is that it is rotation-invariant because '\n",
" 'the straight-line distance between two data points does not '\n",
" 'change with the orientation of the axis system. This property '\n",
" 'also means that transformations; such as PCA, SVD, or the '\n",
" 'wavelet transformation for time series (discussed in Chap.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Among the three data points A, B, and C, which pair are the '\n",
" 'closest to one another? At first sight; it would seem that data '\n",
" 'points A and B are the closest on the basis of Euclidean '\n",
" 'distance. However, the global data distribution tells us '\n",
" 'otherwise. One way of understanding distances is as the shortest '\n",
" 'length of the path from one data point to another, when using '\n",
" 'only point-to-point jumps from data points to one of their '\n",
" 'k-nearest neighbors based on a standard metric\\n'\n",
" '3.2.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Therefore, the overall sum of the point-to-point jumps reflects '\n",
" 'the aggregate change (distance) from one point to another '\n",
" '(distant) point more accurately than a straight-line distance '\n",
" 'between the points. Such distances are referred to as geodesic '\n",
" 'distances: In the case of Fig: 3.4, the only way to walk from A '\n",
" 'to B with short point-to-point jumps is to walk along the entire '\n",
" 'elliptical shape of the data distribution while passing C along '\n",
" 'the way: Therefore, A and B are actually the farthest pair of '\n",
" 'data point...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Conversely; a pair of objects are unlikely to have similar values '\n",
" 'across many attributes, just by chance, unless these attributes '\n",
" 'were relevant _ Interestingly, the Euclidean metric (and Lp-norm '\n",
" 'in general) achieves exactly the opposite effect by using the '\n",
" 'squared sum of the difference in attribute values: As a result; '\n",
" 'the \"noise\" components from the irrelevant attributes dominate '\n",
" 'the computation and mask the similarity effects of a large number '\n",
" 'of relevant attributes. The Lo-norm provides an extreme exampl...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'MULTIDIMENSIONAL DATA 71 0.6 POINT B POINT C POINT A 0.4 0.8 0.2 '\n",
" 'POINT A 0.6 POINT C 0.4 0.2 0.2 POINT B 0.4 0.6 0.2 1.5 0.6 0.5 '\n",
" '0.5 0.2 -0.2 -0.4 -0.6 -0.5 -0.5 55 (a) A and C seem close '\n",
" '(original data) (b) A and C are actually far away (ISOMAP '\n",
" 'embedding) Figure 3.5: Impact of ISOMAP embedding on distances '\n",
" 'such as the Euclidean measure. The intuitive rationale for this '\n",
" 'is that only short point- to-point jumps can accurately measure '\n",
" 'minor changes in the generative process for that point.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'This metric is a generalization of the Euclidean measure, and '\n",
" 'stretches the distance values along the principal components '\n",
" 'according to their variance. A more sophisticated approach; '\n",
" 'referred to as ISOMAP, uses nonlinear embeddings to account for '\n",
" 'the impact of nonlinear data distributions Local normalization '\n",
" 'can often provide more effective measures when the distribution '\n",
" 'of the data is heterogeneous. Other data types such as '\n",
" 'categorical data, text, temporal, and graph data present further '\n",
" 'challenges. The de...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'This section will therefore study each of these types separately: '\n",
" '3.2.1 Quantitative Data The most common distance function for '\n",
" 'quantitative data is the Lp-norm: The Lp-norm between two data '\n",
" 'points X = (11 84) and Y = (y1 Ya) is defined as follows: 1/p '\n",
" 'Dist(X,Y) = Ixci - yi|p i=1 (3.1) Two special cases of the '\n",
" 'Lp-norm are the Euclidean (p 2) and the Manhattan (p = 1) '\n",
" 'metrics These special cases derive their intuition from spatial '\n",
" 'applications where they have clear physical interpretability The '\n",
" 'Euclidean di...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Fractional met- rics were proposed in [17] and generally provide '\n",
" 'more accurate results than the Manhattan and Euclidean metric. '\n",
" 'The ISOMAP method discussed in this chapter was proposed in '\n",
" '[490]. Numerous local methods are also possible for distance '\n",
" 'function computation: An example of an effective local method is '\n",
" 'the instance-based method proposed in [543]. Similarity in '\n",
" 'categorical data was explored extensively in [104]. In this work; '\n",
" 'a number of similarity measures were analyzed; and how they apply '\n",
" 'to the ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '66 CHAPTER 3 SIMILARITY AND DISTANCES To better understand the '\n",
" 'impact of the dimensionality curse on distances; let US examine a '\n",
" 'unit cube of dimensionality d that is fully located in the '\n",
" 'nonnegative quadrant, with one corner at the origin 0. What is '\n",
" 'the Manhattan distance of the corner of this cube (say; at the '\n",
" 'origin) to a randomly chosen point X inside the cube?'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'In high-dimensional domains such as text, similarity functions '\n",
" 'such as the cosine measure (discussed in Sect. 3.3), tend to '\n",
" 'emphasize the cumulative effect of matches 0n many attribute '\n",
" 'values rather than large distances along individual attributes. '\n",
" 'This general principle can also be used for quantitative data. '\n",
" 'One way of de-emphasizing precise levels of dissimilarity is to '\n",
" 'use prorimity thresh- olding in a dimensionality-sensitive way: '\n",
" 'To perform proximity thresholding; the data are discretized into '\n",
" 'equide...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_20_chapter_17_clustering_validation_TB.txt',\n",
" 'text': 'The smaller the DB value the better the clustering; since it '\n",
" 'means that the clusters are well separated (i.e,, the distance '\n",
" 'between cluster means is large) , and each cluster is well '\n",
" 'represented by its mean (i.e.= has a small spread) _ Silhouette '\n",
" 'Coefficient The silhouette coefficient is a measure of both '\n",
" 'cohesion and separation of clusters, and is based on the '\n",
" 'difference between the average distance to points in the closest '\n",
" 'cluster and to points in the same cluster.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '3.5.1 Similarity between Two Nodes in a Single Graph Let G = (N, '\n",
" 'A) be an undirected network with node set N and edge set A. In '\n",
" 'some domains, costs are associated with nodes, whereas in others, '\n",
" 'weights are associated with nodes. For example, in domains such '\n",
" 'as bibliographic networks, the edges are naturally weighted; and '\n",
" 'in road networks, the edges naturally have costs. Typically; '\n",
" 'distance functions work with costs, whereas similarity functions '\n",
" 'work with weights. Therefore; it may be assumed that either '\n",
" 'the...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Similarity and distance functions are often expressed in closed '\n",
" 'form (e.g:, Euclidean distance) , but in some domains, such as '\n",
" 'time-series data, they are defined algorithmically and cannot be '\n",
" 'expressed in closed form Distance functions are fundamental to '\n",
" 'the effective design of data mining algorithms, because a poor '\n",
" 'choice in this respect may be very detrimental to the quality of '\n",
" 'the results. Sometimes, data analysts use the Euclidean function '\n",
" 'as a \"black box\" without much thought about the overall impact '\n",
" 'o...'}\n",
"\n",
"{ 'name': 'ESL__1412_support_vector_machines_and_flexible_discriminant_.txt',\n",
" 'text': 'A new observation is classified to the class with closest '\n",
" 'centroid. A slight twist is that distance is measured in the '\n",
" 'Mahalanobis metric; using a pooled covariance estimate. 12.4 '\n",
" 'Generalizing Linear Discriminant Analysis 439 LDA is the '\n",
" 'estimated Bayes classifier if the observations are multi- variate '\n",
" 'Gaussian in each class, with a common covariance matrix: Since '\n",
" 'this assumption is unlikely to be true, this might not seem to be '\n",
" 'much of a virtue. The decision boundaries created by LDA are '\n",
" 'linear , leading t...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_2116_mining_spatial_data_TB.txt',\n",
" 'text': 'Chapter 16 Mining Spatial Data 6 Time and space are modes by '\n",
" 'which we think and not conditions in which we live. Albert '\n",
" 'Einstein 16.1 Introduction Spatial data arises commonly in '\n",
" 'geographical data mining applications Numerous appli- cations '\n",
" 'related to meteorological data; earth science; image analysis, '\n",
" 'and vehicle data are spatial in nature: In many cases, spatial '\n",
" 'data is integrated with temporal components. Such data is '\n",
" 'referred to as spatiotemporal data.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'This is a very special example where only a 1- dimensional '\n",
" 'projection works well. However , it may not be generalizable to '\n",
" 'an arbitrary data set_ A more general way of computing the '\n",
" 'distances in a class-sensitive way; is to use a soft weighting of '\n",
" 'different directions, rather than selecting specific dimensions '\n",
" 'in a hard way This can be achieved with the use of an appropriate '\n",
" 'choice of matrix A in Eq. 10.71.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '3.2.1.9 Computational Considerations A major consideration in the '\n",
" 'design of distance functions is the computational complexity: '\n",
" 'This is because distance function computation is often embedded '\n",
" 'as a subroutine that is used repeatedly in the application at '\n",
" 'hand: If the subroutine is not efficiently implementable the '\n",
" 'applicability becomes more restricted: For example, methods such '\n",
" 'as ISOMAP are computationally expensive and hard to implement for '\n",
" 'very large data sets because these methods scale with at least '\n",
" 'th...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_72_data_preparation_TB.txt',\n",
" 'text': 'A larger number of data points typically requires a larger '\n",
" 'dimensionality of representation to achieve the same stress. The '\n",
" 'most crucial element is, however, the inherent structure of the '\n",
" 'distance matrix: For example, if a 10,000 x 10,000 distance '\n",
" 'matrix contains the pairwise driving distance between 10,000 '\n",
" 'cities, it can usually be approximated quite well with just a '\n",
" '2-dimensional representation_ This is because driving distances '\n",
" 'are an approximation of Euclidean distances in 2-dimensional '\n",
" 'space. On the ...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_17_chapter_14_hierarchical_clustering_TB.txt',\n",
" 'text': 'Several distance measures, such as single link, complete link, '\n",
" 'group average, and others discussed below, can be used to '\n",
" 'compute the distance between any two clusters. The between '\n",
" 'cluster distances are ultimately based on the distance between '\n",
" 'two points, which is typically computed using the Euclidean '\n",
" \"distance O1 L2-norm, defined as 1/2 f(x,y) = Ilx -yll2 = '(ci - \"\n",
" 'yi)? i=1 However , one may use other distance metrics, or if '\n",
" 'available one may a user-specified distance matrix: DRAFT '\n",
" '2013-07-10 11:07. Pleas...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Similarity graph-based methods are almost always locality '\n",
" 'sensitive because of their local focus on the k-nearest neighbor '\n",
" 'distribution: Generic Methods: In generic local distance '\n",
" 'computation methods, the idea is to divide the space into a set '\n",
" 'of local regions. The distances are then adjusted in each region '\n",
" 'using the local statistics of this region. Therefore, the broad '\n",
" 'approach is as follows: 1 Partition the data into a set of local '\n",
" 'regions.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'Specifically; the Euclidean distance of a data point to its '\n",
" 'kth-nearest neighbors is used to define the outlier score.'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_53_analytic_geometry_TB.txt',\n",
" 'text': ') Then d(x,y) := Ilx - yll (1 5 Y,x - y) (3.21) is called the '\n",
" 'distance between x and y for &,y € V. If we use the dot distance '\n",
" 'product as the inner product; then the distance is called '\n",
" 'Euclidean distance. Euclidean distance 02021 . 76 Analytic '\n",
" 'Geometry The mapping d :V xV _ R (1,y) + d(, y) (3.22) (3.23) '\n",
" 'metric is called a metric.'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_20_chapter_17_clustering_validation_TB.txt',\n",
" 'text': 'Examples of distance functions include normal- ized mutual '\n",
" 'information, variation of information, and conditional entropy '\n",
" '(which is asymmetric). Examples of similarity functions include '\n",
" 'Jaccard, Fowlkes-Mallows; Hubert T statistic, and s0 on_ 0.9 0.8 '\n",
" '1 0.7 0.6 0.5 ] 0.4 0.3 0.2 0.1 ps (k) FM pa(k:) : VI 4 3 4 2 5 k '\n",
" '6 8 9 Figure 17.6: Clustering Stability: Iris Dataset Example '\n",
" '17.10: We study the clustering stability for the Iris principal '\n",
" 'compo nents dataset, with n = 150, using the K-means algorithm. '\n",
" 'We u...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Therefore, this section will address both data types, and each '\n",
" 'similarity measure will be discussed in a subsection on either '\n",
" 'continuous series or discrete series, based on its most common '\n",
" 'use For some measures, the usage is common across both data '\n",
" 'types_ 3.4.1 Time-Series Similarity Measures The design of '\n",
" 'time-series similarity measures is highly application specific. '\n",
" 'For example, the simplest possible similarity measure between two '\n",
" 'time series of equal length is the Euclidean metric. Although '\n",
" 'such a metri...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Such fractional metrics can provide more effective results for '\n",
" 'the high-dimensional case. As a rule of thumb, the larger the '\n",
" 'dimensionality; the lower the value of p. However, no exact rule '\n",
" 'exists on the precise choice of p because dimensionality is not '\n",
" 'the only factor in determining the proper value of p. The '\n",
" 'precise choice of p should be selected in an '\n",
" 'application-specific way, with the use of benchmarking: The '\n",
" 'bibliographic notes contain discussions on the use of fractional '\n",
" 'metrics.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The Mahalanobis distance is similar to the Euclidean distance, '\n",
" 'except that it normalizes the data on the basis of the '\n",
" 'interattribute correlations_ For example, if the axis system were '\n",
" 'to be rotated to the principal directions of the data (shown in '\n",
" 'Fig: 3.3) then the data would have no (second order) '\n",
" 'interattribute correlations.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'In similarity functions, larger values imply greater similarity, '\n",
" 'whereas in distance func- tions, smaller values imply greater '\n",
" 'similarity In some domains, such as spatial data, it is more '\n",
" 'natural to talk about distance functions, whereas in other '\n",
" 'domains; such as text; it is more natural to talk about '\n",
" 'similarity functions. Nevertheless, the principles involved in '\n",
" 'the design of such functions are generally invariant across '\n",
" 'different data domains. This chap- ter will, therefore, use '\n",
" 'either of the terms \"dist...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_10_chapter_7_dimensionality_reduction_TB.txt',\n",
" 'text': 'Feedback is Welcome_ Note that this book shall be available for '\n",
" 'purchase from Cambridge University Press and other standard '\n",
" 'distribution channels, that no unauthorized distribution shall be '\n",
" 'allowed, and that the reader may make one copy only for personal '\n",
" 'on-screen use. CHAPTER 7. DIMENSIONALITY REDUCTION 237 Pearson, K '\n",
" '(1901) , \"On lines and planes of closest fit to systems of points '\n",
" 'in space\" The London, Edinburgh, and Dublin Philosophical '\n",
" 'Magazine and Journal of Sci- ence, 2 (11), pp. 559-572.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '(3.4) Ll This ratio can be interpreted as the distance contrast '\n",
" 'between the different data points; in terms of how different '\n",
" 'the minimum and maximum distances from the origin might be '\n",
" 'considered. Because the contrast reduces with Vd, it means that '\n",
" 'there is virtually no contrast with increasing dimensionality '\n",
" 'Lower contrasts are obviously not desirable because it means that '\n",
" 'the data mining algorithm will score the distances between all '\n",
" 'pairs of data points in approximately the same way and will not '\n",
" 'discr...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Figure 3.2b is derived from Fig: 3.2a, except that the results '\n",
" 'show the fraction of the Manhattan performance achieved by higher '\n",
" 'order norms It is evident that the rate of degradation with '\n",
" 'increasing p is higher when the dimensionality of the data is '\n",
" 'large. For 2-dimensional data, there is very little degradation. '\n",
" 'This is the reason that the value of p matters less in lower '\n",
" 'dimensional applications. 68 CHAPTER 3 SIMILARITY AND DISTANCES '\n",
" 'This argument has been used to propose the concept of fractional '\n",
" 'metric...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'Other faster methods for approximation exist, such as the use of '\n",
" 'the SAX representation_ When the SAX representation is used, a '\n",
" 'table of precomputed distances can be maintained for all pairs of '\n",
" 'discrete values; and a simple table lookup is required for lower '\n",
" 'bounding: Furthermore, some other time series distance functions '\n",
" 'such as dynamic time warping can also be bounded from below _ The '\n",
" 'bibliographic notes contain pointers to some of these bounds.'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_116_probability_and_distributions_TB.txt',\n",
" 'text': 'Recall that the probability mass (or density) is posi- tive and '\n",
" 'needs to add up to 1_ These constraints mean that distributions '\n",
" 'live on something called a statistical manifold: The study of '\n",
" 'this space of probability distributions is called information '\n",
" 'geometry Computing dis- tances between distributions are often '\n",
" 'done using Kullback-Leibler diver- gence, which is a '\n",
" 'generalization of distances that account for properties of the '\n",
" 'statistical manifold.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '2) , can be used on the data without affectingl the distance: '\n",
" 'Another interesting special case is that obtained by setting p = '\n",
" 'C The result of this computation is to select the dimension for '\n",
" 'which the two objects are the most distant from one another and '\n",
" 'report the absolute value of this distance: All other features '\n",
" 'are ignored.a The Lp-norm is one of the most popular distance '\n",
" 'functions used by data mining analysts. One of the reasons for '\n",
" 'its popularity is the natural intuitive appeal and '\n",
" 'interpretability o...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'When a one-to-one mapping does exist, many of the '\n",
" 'multidimensional categorical distance measures can be adapted to '\n",
" 'this domain, just as the Lp-norm can be adapted to continuous '\n",
" 'time series. However, the application domains of discrete '\n",
" 'sequence data are most often such that a one-to-one mapping does '\n",
" 'not exist.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1712_mining_data_streams_TB.txt',\n",
" 'text': 'The Gaussian spatial kernel function Khs was used because of its '\n",
" 'well-known effectiveness. Specifically; Kha is the product of d '\n",
" 'identical gaussian kernel functions, and hs (h3 hd) , where hs '\n",
" 'is the smoothing parameter for dimension i_ The velocity '\n",
" 'density is associated with a data point as well as a time '\n",
" 'instant, and therefore this definition allows the labeling of '\n",
" 'both data points and time instants as outliers.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_72_data_preparation_TB.txt',\n",
" 'text': 'This scaling is necessary to ensure 2 i=1 22 that documents of '\n",
" 'varying length are treated in a uniform way: After this scaling; '\n",
" 'traditional numeric measures, such as the Euclidean distance, '\n",
" 'work more effectively. LSA is discussed in Sect. 2.4.3.3 of this '\n",
" 'chapter.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'In the special case where p 2, accurate distance computations are '\n",
" 'obtained with the wavelet representation; if most of the larger '\n",
" 'wavelet coefficients are retained in the representation: In fact, '\n",
" 'it can be shown that if no wavelet coefficients are removed, then '\n",
" 'the distances are identical between the two representations. This '\n",
" 'is because wavelet transformations can be viewed as a rotation of '\n",
" 'an axis system in which each dimension represents a time stamp. '\n",
" 'Euclidean metrics are invariant to axis rotation.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Clearly, A and B are equidistant from the origin according to any '\n",
" 'Lp-norm However , a question arises, as to whether A and B should '\n",
" 'truly be considered equidistant from the origin 0 This is because '\n",
" 'the straight line from 0 to A is aligned with a high-variance '\n",
" 'direction in the data, and statistically, it is more likely for '\n",
" 'data points to be further away in this direction.'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_5_chapter_2_numeric_attributes_TB.txt',\n",
" 'text': 'The Mahalanobis distance is a generalization of Euclidean '\n",
" 'distance, since if we set 2 = I where I is the d X d identity '\n",
" \"matrix (with diagonal elements as 1's and off-diagonal elements \"\n",
" \"as 0's) , we get (xi - p)T I-1 (xi = p) = Ilxi pll? The Euclidean \"\n",
" 'distance thus ignores the covariance information between the '\n",
" 'at- tributes, whereas the Mahalanobis distance explicitly takes '\n",
" 'it into consideration The standard multivariate normal '\n",
" 'distribution has parameters /L = 0 and 2 = I.'}\n",
"\n",
"{ 'name': 'HW20-solution_distance_functions_vector.txt',\n",
" 'text': 'The value d of the metrics were scaled for visualization '\n",
" 'purposes: a) Manhattan = 1, b) Hamming = 100 , Euclidean = no '\n",
" 'change, d) Chebyshev =d * 2, e) Minkowski = d * 2 For example, '\n",
" 'the average distances of talk.religion.misc: talk religion.misc '\n",
" 'and talk.religion.misc: rec.autos have the same value for the '\n",
" 'Minkowski metrics with d = 3 and d = 4. Exercise L.d Another '\n",
" 'similarity measure is also often suggested in the literature, '\n",
" 'viz.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'The idea is to select points on the Gaussian curve, SO that the '\n",
" 'area between successive breakpoints is equal, and therefore the '\n",
" 'different symbols have approximately the same frequency: 14.2.5 '\n",
" 'Time Series Similarity Measures Time series similarity measures '\n",
" 'are typically designed with application-specific goals in mind: '\n",
" 'The most common methods for time series similarity computation '\n",
" 'are Euclidean distance and dynamic time warping (DTW) The '\n",
" 'Euclidean distance is defined in an iden- tical way to '\n",
" 'multidimensi...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1712_mining_data_streams_TB.txt',\n",
" 'text': 'The squared distance between a data point and its assigned '\n",
" 'cluster center is denoted by dist(Xi, Yji), where the data record '\n",
" 'Xi is assigned to the representative Yji In principle, any '\n",
" 'partitioning algorithm, such as k-means or k-medoids, can be '\n",
" 'applied to the segment Si in order to determine the '\n",
" 'representatives Yi Yk_ For the purpose of discussion, this '\n",
" 'algorithm will be treated as a black box_ After the first segment '\n",
" 'S1 has been processed, we now have a set of k medians that are '\n",
" 'stored away: The number of ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'This is especially important when the distances between X and Y '\n",
" 'are small, and it will result in greater statistical fluctuations '\n",
" 'in the raw distances. At a conceptual level, it is possible to '\n",
" 'define a version of LOF directly in terms of raw distances, '\n",
" 'rather than reachability distances. However; such a version would '\n",
" 'be missing the stability provided by smoothing: The average '\n",
" 'reachability distance ARK(X of data point X with respect to its '\n",
" 'neigh- borhood Lk(X) is defined as the average of its '\n",
" 'reachability di...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'Even when the data is uncorrelated, the Mahalanobis metric is '\n",
" 'useful because it auto-scales for the naturally different ranges '\n",
" 'of attributes describ ing different physical quantities, such as '\n",
" 'age and salary: Such a scaling ensures that no single attribute '\n",
" 'dominates the distance function_ In cases where the attributes '\n",
" 'are correlated, the Mahalanobis metric accounts well for the '\n",
" 'varying redundancies in different features. How- ever, its major '\n",
" 'weakness is that it does not account for the varying shapes of '\n",
" 'the ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_116_cluster_analysis_TB.txt',\n",
" 'text': 'Therefore, the individual data points in dense regions are used '\n",
" 'as building blocks after classifying them on the basis of their '\n",
" 'density: The density of a data point is defined by the number of '\n",
" 'points that lie within a radius Eps of that point (including the '\n",
" 'point itself) The densities of these spherical regions are used '\n",
" 'to classify the data points into core, border; O noise points.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'It is important to remember that these normalization issues may '\n",
" 'not be relevant to every application. Some applications may '\n",
" 'require only translation, only scaling, or neither of the two. '\n",
" 'Other applications may require both: In fact; in some cases, the '\n",
" 'wrong choice of\\n'\n",
" '3.4.'}\n",
"\n",
"{ 'name': 'HW20-solution_distance_functions_vector.txt',\n",
" 'text': 'Exercise L.c Which metric seems to provide, on average, the best '\n",
" 'separation between groups? Explain why this is the case: Without '\n",
" 'a clear definition of what constitutes a good separation between '\n",
" 'the groups, we can choose a metric whose values lead to larger '\n",
" 'variability when computed across groups. Both the Manhattan and '\n",
" 'the Hamming distance seem to satisfy this require- ment and, to a '\n",
" 'lesser extent, the Euclidean distance.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'Instead of using the Euclidean distance metric, the distance '\n",
" 'between two d-dimensional points X and Y is defined with respect '\n",
" 'to a d X d matrix A Dist(X,Y) = Vr-Y)A(X 5 Y)T (10.71) This '\n",
" 'distance function is the same as the Euclidean metric when A is '\n",
" 'the identity matrix. Different choices of A can lead to better '\n",
" 'sensitivity of the distance function to the local and global data '\n",
" 'distributions. These different choices will be discussed in '\n",
" 'the following subsections. 10.8.1.1 Unsupervised Mahalanobis '\n",
" 'Metric ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '82 CHAPTER 3 SIMILARITY AND DISTANCES 3.4.1.4 Window-Based '\n",
" 'Methods The example in Fig: 3.7 illustrates a case where dropped '\n",
" 'readings may cause a gap in the matching: Window-based schemes '\n",
" 'attempt to decompose the two series into windows and then '\n",
" '\"stitch\" together the similarity measure The intuition here is '\n",
" 'that if two series have many contiguous matching segments, they '\n",
" 'should be considered similar: For long time series, a global '\n",
" 'match becomes increasingly unlikely.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The major problem with Lp-norms is that they are designed for '\n",
" 'time series of equal length and cannot address distortions on the '\n",
" 'temporal (contextual) attributes: 3.4.1.3 Dynamic Time Warping '\n",
" 'Distance DTW stretches the series along the time axis in a '\n",
" 'varying (0 dynamic) way over different portions to enable more '\n",
" 'effective matching: An example of warping is illustrated in Fig: '\n",
" '3.8a, where the two series have very similar shape in segments A '\n",
" 'B, and C, but specific segments in each series need to be '\n",
" 'stretched a...'}\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "3jGW7zwRotZ6",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "025191a8-4149-4dd9-de99-2d7ea9313135"
},
"source": [
"# Colab form fields: the retrieval query and how many documents to fetch\n",
"rd_query = \"distance function\"  # @param {type:\"string\"}\n",
"num_results = 3  # @param {type:\"integer\"}\n",
"\n",
"# Run the retrieval pipeline and pretty-print the top matches\n",
"search_result = search_pipe.run(\n",
"    query=rd_query,\n",
"    params={\"retriever\": {\"top_k\": num_results}},\n",
")\n",
"print_documents(search_result, max_text_len=512)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Query: distance function\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'This metric is a generalization of the Euclidean measure, and '\n",
" 'stretches the distance values along the principal components '\n",
" 'according to their variance. A more sophisticated approach; '\n",
" 'referred to as ISOMAP, uses nonlinear embeddings to account for '\n",
" 'the impact of nonlinear data distributions Local normalization '\n",
" 'can often provide more effective measures when the distribution '\n",
" 'of the data is heterogeneous. Other data types such as '\n",
" 'categorical data, text, temporal, and graph data present further '\n",
" 'challenges. The de...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_20_chapter_17_clustering_validation_TB.txt',\n",
" 'text': 'Examples of distance functions include normal- ized mutual '\n",
" 'information, variation of information, and conditional entropy '\n",
" '(which is asymmetric). Examples of similarity functions include '\n",
" 'Jaccard, Fowlkes-Mallows; Hubert T statistic, and s0 on_ 0.9 0.8 '\n",
" '1 0.7 0.6 0.5 ] 0.4 0.3 0.2 0.1 ps (k) FM pa(k:) : VI 4 3 4 2 5 k '\n",
" '6 8 9 Figure 17.6: Clustering Stability: Iris Dataset Example '\n",
" '17.10: We study the clustering stability for the Iris principal '\n",
" 'compo nents dataset, with n = 150, using the K-means algorithm. '\n",
" 'We u...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Similarity and distance functions are often expressed in closed '\n",
" 'form (e.g:, Euclidean distance) , but in some domains, such as '\n",
" 'time-series data, they are defined algorithmically and cannot be '\n",
" 'expressed in closed form Distance functions are fundamental to '\n",
" 'the effective design of data mining algorithms, because a poor '\n",
" 'choice in this respect may be very detrimental to the quality of '\n",
" 'the results. Sometimes, data analysts use the Euclidean function '\n",
" 'as a \"black box\" without much thought about the overall impact '\n",
" 'o...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Therefore, the overall sum of the point-to-point jumps reflects '\n",
" 'the aggregate change (distance) from one point to another '\n",
" '(distant) point more accurately than a straight-line distance '\n",
" 'between the points. Such distances are referred to as geodesic '\n",
" 'distances: In the case of Fig: 3.4, the only way to walk from A '\n",
" 'to B with short point-to-point jumps is to walk along the entire '\n",
" 'elliptical shape of the data distribution while passing C along '\n",
" 'the way: Therefore, A and B are actually the farthest pair of '\n",
" 'data point...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'In similarity functions, larger values imply greater similarity, '\n",
" 'whereas in distance func- tions, smaller values imply greater '\n",
" 'similarity In some domains, such as spatial data, it is more '\n",
" 'natural to talk about distance functions, whereas in other '\n",
" 'domains; such as text; it is more natural to talk about '\n",
" 'similarity functions. Nevertheless, the principles involved in '\n",
" 'the design of such functions are generally invariant across '\n",
" 'different data domains. This chap- ter will, therefore, use '\n",
" 'either of the terms \"dist...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Fractional met- rics were proposed in [17] and generally provide '\n",
" 'more accurate results than the Manhattan and Euclidean metric. '\n",
" 'The ISOMAP method discussed in this chapter was proposed in '\n",
" '[490]. Numerous local methods are also possible for distance '\n",
" 'function computation: An example of an effective local method is '\n",
" 'the instance-based method proposed in [543]. Similarity in '\n",
" 'categorical data was explored extensively in [104]. In this work; '\n",
" 'a number of similarity measures were analyzed; and how they apply '\n",
" 'to the ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '3.2.1.9 Computational Considerations A major consideration in the '\n",
" 'design of distance functions is the computational complexity: '\n",
" 'This is because distance function computation is often embedded '\n",
" 'as a subroutine that is used repeatedly in the application at '\n",
" 'hand: If the subroutine is not efficiently implementable the '\n",
" 'applicability becomes more restricted: For example, methods such '\n",
" 'as ISOMAP are computationally expensive and hard to implement for '\n",
" 'very large data sets because these methods scale with at least '\n",
" 'th...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Similarity graph-based methods are almost always locality '\n",
" 'sensitive because of their local focus on the k-nearest neighbor '\n",
" 'distribution: Generic Methods: In generic local distance '\n",
" 'computation methods, the idea is to divide the space into a set '\n",
" 'of local regions. The distances are then adjusted in each region '\n",
" 'using the local statistics of this region. Therefore, the broad '\n",
" 'approach is as follows: 1 Partition the data into a set of local '\n",
" 'regions.'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_20_chapter_17_clustering_validation_TB.txt',\n",
" 'text': 'The smaller the DB value the better the clustering; since it '\n",
" 'means that the clusters are well separated (i.e,, the distance '\n",
" 'between cluster means is large) , and each cluster is well '\n",
" 'represented by its mean (i.e.= has a small spread) _ Silhouette '\n",
" 'Coefficient The silhouette coefficient is a measure of both '\n",
" 'cohesion and separation of clusters, and is based on the '\n",
" 'difference between the average distance to points in the closest '\n",
" 'cluster and to points in the same cluster.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '66 CHAPTER 3 SIMILARITY AND DISTANCES To better understand the '\n",
" 'impact of the dimensionality curse on distances; let US examine a '\n",
" 'unit cube of dimensionality d that is fully located in the '\n",
" 'nonnegative quadrant, with one corner at the origin 0. What is '\n",
" 'the Manhattan distance of the corner of this cube (say; at the '\n",
" 'origin) to a randomly chosen point X inside the cube?'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Among the three data points A, B, and C, which pair are the '\n",
" 'closest to one another? At first sight; it would seem that data '\n",
" 'points A and B are the closest on the basis of Euclidean '\n",
" 'distance. However, the global data distribution tells us '\n",
" 'otherwise. One way of understanding distances is as the shortest '\n",
" 'length of the path from one data point to another, when using '\n",
" 'only point-to-point jumps from data points to one of their '\n",
" 'k-nearest neighbors based on a standard metric\\n'\n",
" '3.2.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Conversely; a pair of objects are unlikely to have similar values '\n",
" 'across many attributes, just by chance, unless these attributes '\n",
" 'were relevant _ Interestingly, the Euclidean metric (and Lp-norm '\n",
" 'in general) achieves exactly the opposite effect by using the '\n",
" 'squared sum of the difference in attribute values: As a result; '\n",
" 'the \"noise\" components from the irrelevant attributes dominate '\n",
" 'the computation and mask the similarity effects of a large number '\n",
" 'of relevant attributes. The Lo-norm provides an extreme exampl...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'In high-dimensional domains such as text, similarity functions '\n",
" 'such as the cosine measure (discussed in Sect. 3.3), tend to '\n",
" 'emphasize the cumulative effect of matches 0n many attribute '\n",
" 'values rather than large distances along individual attributes. '\n",
" 'This general principle can also be used for quantitative data. '\n",
" 'One way of de-emphasizing precise levels of dissimilarity is to '\n",
" 'use prorimity thresh- olding in a dimensionality-sensitive way: '\n",
" 'To perform proximity thresholding; the data are discretized into '\n",
" 'equide...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '3.5.1 Similarity between Two Nodes in a Single Graph Let G = (N, '\n",
" 'A) be an undirected network with node set N and edge set A. In '\n",
" 'some domains, costs are associated with nodes, whereas in others, '\n",
" 'weights are associated with nodes. For example, in domains such '\n",
" 'as bibliographic networks, the edges are naturally weighted; and '\n",
" 'in road networks, the edges naturally have costs. Typically; '\n",
" 'distance functions work with costs, whereas similarity functions '\n",
" 'work with weights. Therefore; it may be assumed that either '\n",
" 'the...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'MULTIDIMENSIONAL DATA 71 0.6 POINT B POINT C POINT A 0.4 0.8 0.2 '\n",
" 'POINT A 0.6 POINT C 0.4 0.2 0.2 POINT B 0.4 0.6 0.2 1.5 0.6 0.5 '\n",
" '0.5 0.2 -0.2 -0.4 -0.6 -0.5 -0.5 55 (a) A and C seem close '\n",
" '(original data) (b) A and C are actually far away (ISOMAP '\n",
" 'embedding) Figure 3.5: Impact of ISOMAP embedding on distances '\n",
" 'such as the Euclidean measure. The intuitive rationale for this '\n",
" 'is that only short point- to-point jumps can accurately measure '\n",
" 'minor changes in the generative process for that point.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'When a one-to-one mapping does exist, many of the '\n",
" 'multidimensional categorical distance measures can be adapted to '\n",
" 'this domain, just as the Lp-norm can be adapted to continuous '\n",
" 'time series. However, the application domains of discrete '\n",
" 'sequence data are most often such that a one-to-one mapping does '\n",
" 'not exist.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'It has been shown that for most smooth functions Kh(:), when the '\n",
" 'number of data points goes to infinity; the estimate '\n",
" 'asymptotically converges to the true density value, provided that '\n",
" 'the width h is chosen appropriately. The density at each data '\n",
" 'point is computed with- out including the point itself in the '\n",
" 'density computation: The value of the density is reported as the '\n",
" 'outlier score. Low values of the density indicate greater '\n",
" 'tendency to be an outlier. Density-based methods have similar '\n",
" 'challenges as histo...'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_17_chapter_14_hierarchical_clustering_TB.txt',\n",
" 'text': 'Thus, the computational complexity of hierarchical clustering is '\n",
" 'O(n2 log n) 14.3 Further Reading Hierarchical clustering has a '\n",
" 'long history; especially in taxonomy or classificatory systems, '\n",
" 'and phylogenetics, see for example (Sokal and Sneath; 1963). The '\n",
" 'generic Lance-Williams formula for distance updates appears in '\n",
" \"(Lance and Williams, 1967). Ward's measure is from (Ward, 1963). \"\n",
" 'Efficient methods for single link and com- plete link measures '\n",
" 'with O(n2) complexity; are given in (Sibson; 1973) and (Defays...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'Therefore, this book has classified this approach as a '\n",
" '(normalized) distance-based method; rather than as a '\n",
" 'density-based method: 8.5.2.2 Instance-Specific Mahalanobis '\n",
" 'Distance The instance-specific Mahalanobis distance is designed '\n",
" 'for adjusting to varying shapes of the distributions in the '\n",
" 'locality of a particular data point, as illustrated in Fig: 8.8b. '\n",
" 'The Mahalanobis distance is directly related to shape of the data '\n",
" 'distribution; although it is tra- ditionally used in the global '\n",
" 'sense: Of course, it is ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'Specifically; the Euclidean distance of a data point to its '\n",
" 'kth-nearest neighbors is used to define the outlier score.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_2217_mining_graph_data_TB.txt',\n",
" 'text': 'Sections 17.2 and 17.3 of this chapter have discussed methods for '\n",
" 'distance computation in graphs. After a distance function has '\n",
" 'been designed, the following two methods can be used: 1_ The '\n",
" 'k-medoids method introduced in Sect. 6.3.4 in Chap. 6 uses a '\n",
" 'representative- based approach; in which the distances of data '\n",
" 'objects to their closest representatives are used to perform the '\n",
" 'clustering: A set of k representatives is used, and data objects '\n",
" 'are assigned to their closest representatives by using an '\n",
" 'appropriate...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_116_probability_and_distributions_TB.txt',\n",
" 'text': 'Just like the Euclidean distance is a special case of a metric '\n",
" '(Section 3.3) , the Kullback-Leibler divergence is a special case '\n",
" 'of two more general classes of divergences called Bregman '\n",
" 'divergences and f-divergences. The study of divergences is beyond '\n",
" 'the scope of this book, and we refer for more details to the '\n",
" 'recent book by Amari (2016), one of the founders of the field of '\n",
" 'information geometry 6.5 Gaussian Distribution The Gaussian '\n",
" 'distribution is the most well-studied probability distribution '\n",
" 'for contin...'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_116_probability_and_distributions_TB.txt',\n",
" 'text': 'Recall that the probability mass (or density) is posi- tive and '\n",
" 'needs to add up to 1_ These constraints mean that distributions '\n",
" 'live on something called a statistical manifold: The study of '\n",
" 'this space of probability distributions is called information '\n",
" 'geometry Computing dis- tances between distributions are often '\n",
" 'done using Kullback-Leibler diver- gence, which is a '\n",
" 'generalization of distances that account for properties of the '\n",
" 'statistical manifold.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1712_mining_data_streams_TB.txt',\n",
" 'text': 'The Gaussian spatial kernel function Khs was used because of its '\n",
" 'well-known effectiveness. Specifically; Kha is the product of d '\n",
" 'identical gaussian kernel functions, and hs (h3 hd) , where hs '\n",
" 'is the smoothing parameter for dimension i_ The velocity '\n",
" 'density is associated with a data point as well as a time '\n",
" 'instant, and therefore this definition allows the labeling of '\n",
" 'both data points and time instants as outliers.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_72_data_preparation_TB.txt',\n",
" 'text': 'The quantiles of the Gaussian distribu- tion are used to '\n",
" 'determine the boundaries of the intervals. This is more efficient '\n",
" 'than sorting all the data values to determine quantiles, and it '\n",
" 'may be a more practical approach for a long (or streaming) time '\n",
" 'series. The values are discretized into a small number (typically '\n",
" '3 to 10) of intervals for the best results. Each such equi-depth '\n",
" 'inter- val is mapped to a symbolic value.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The Mahalanobis distance is similar to the Euclidean distance, '\n",
" 'except that it normalizes the data on the basis of the '\n",
" 'interattribute correlations_ For example, if the axis system were '\n",
" 'to be rotated to the principal directions of the data (shown in '\n",
" 'Fig: 3.3) then the data would have no (second order) '\n",
" 'interattribute correlations.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The Manhattan distance is the \"city block\" driving distance in a '\n",
" 'region in which the streets are arranged as a rectangular grid, '\n",
" 'such as the Manhattan Island of New York City: A nice property of '\n",
" 'the Euclidean distance is that it is rotation-invariant because '\n",
" 'the straight-line distance between two data points does not '\n",
" 'change with the orientation of the axis system. This property '\n",
" 'also means that transformations; such as PCA, SVD, or the '\n",
" 'wavelet transformation for time series (discussed in Chap.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '82 CHAPTER 3 SIMILARITY AND DISTANCES 3.4.1.4 Window-Based '\n",
" 'Methods The example in Fig: 3.7 illustrates a case where dropped '\n",
" 'readings may cause a gap in the matching: Window-based schemes '\n",
" 'attempt to decompose the two series into windows and then '\n",
" '\"stitch\" together the similarity measure The intuition here is '\n",
" 'that if two series have many contiguous matching segments, they '\n",
" 'should be considered similar: For long time series, a global '\n",
" 'match becomes increasingly unlikely.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'Intuitively, when Y is in a dense region and the distance between '\n",
" 'X and Y is large: the reachability distance of X with respect to '\n",
" 'it is_equal to the true distance Dist(X,Y) On the other hand; '\n",
" 'when the distances between X and Y are small, then the '\n",
" 'reachability distance is smoothed out by the k-nearest neighbor '\n",
" 'distance of Y The larger the value of k, the greater the '\n",
" 'smoothing: Correspondingly; the reachability distances with '\n",
" 'respect to different points will also become more similar. The '\n",
" 'reason for using thi...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'The Mahalanobis distance is simply equal to the Euclidean '\n",
" 'distance in such a transformed axes-rotated) data set after '\n",
" 'dividing each of the transformed coordinate values by the '\n",
" 'standard deviation along its direction This approach provides a '\n",
" 'neat way to model the probability distribution of the Mahalanobis '\n",
" 'distance, and it also provides a concrete estimate of the '\n",
" 'cumulative probability in the multivariate tail.'}\n",
"\n",
"{ 'name': 'conv_dm_and_ac_10_chapter_7_dimensionality_reduction_TB.txt',\n",
" 'text': 'Feedback is Welcome_ Note that this book shall be available for '\n",
" 'purchase from Cambridge University Press and other standard '\n",
" 'distribution channels, that no unauthorized distribution shall be '\n",
" 'allowed, and that the reader may make one copy only for personal '\n",
" 'on-screen use. CHAPTER 7. DIMENSIONALITY REDUCTION 237 Pearson, K '\n",
" '(1901) , \"On lines and planes of closest fit to systems of points '\n",
" 'in space\" The London, Edinburgh, and Dublin Philosophical '\n",
" 'Magazine and Journal of Sci- ence, 2 (11), pp. 559-572.'}\n",
"\n",
"{ 'name': 'OCR_dmc_part_1_motivation_distance_function.txt',\n",
" 'text': 'The larger p, the more large deviations in one dimension matter. '\n",
" 'For p - 0, the Minkowski distances converges to the Chebyshev '\n",
" 'distance For p Z 1, the Minkowski distance is a metric.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'The idea is to select points on the Gaussian curve, SO that the '\n",
" 'area between successive breakpoints is equal, and therefore the '\n",
" 'different symbols have approximately the same frequency: 14.2.5 '\n",
" 'Time Series Similarity Measures Time series similarity measures '\n",
" 'are typically designed with application-specific goals in mind: '\n",
" 'The most common methods for time series similarity computation '\n",
" 'are Euclidean distance and dynamic time warping (DTW) The '\n",
" 'Euclidean distance is defined in an iden- tical way to '\n",
" 'multidimensi...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Therefore, this section will address both data types, and each '\n",
" 'similarity measure will be discussed in a subsection on either '\n",
" 'continuous series or discrete series, based on its most common '\n",
" 'use For some measures, the usage is common across both data '\n",
" 'types_ 3.4.1 Time-Series Similarity Measures The design of '\n",
" 'time-series similarity measures is highly application specific. '\n",
" 'For example, the simplest possible similarity measure between two '\n",
" 'time series of equal length is the Euclidean metric. Although '\n",
" 'such a metri...'}\n",
"\n",
"{ 'name': 'conv_uml_tb_23_dimensionality_reduction_t.txt',\n",
" 'text': '6 Random Projections Preserve Inner Products: The '\n",
" 'Johnson-Lindenstrauss lemma tells us that a random projection '\n",
" 'preserves distances between a finite set of vectors.'}\n",
"\n",
"{ 'name': 'TB-v_mm_lt_bc_53_analytic_geometry_TB.txt',\n",
" 'text': ') Then d(x,y) := Ilx - yll (1 5 Y,x - y) (3.21) is called the '\n",
" 'distance between x and y for &,y € V. If we use the dot distance '\n",
" 'product as the inner product; then the distance is called '\n",
" 'Euclidean distance. Euclidean distance 02021 . 76 Analytic '\n",
" 'Geometry The mapping d :V xV _ R (1,y) + d(, y) (3.22) (3.23) '\n",
" 'metric is called a metric.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'The particular choice of discretization depends on the goals of '\n",
" 'the application at hand\\n'\n",
" '3.5. GRAPH SIMILARITY MEASURES 85 Figure 3.10: Shortest path '\n",
" 'versus homophily 3.5 Graph Similarity Measures The similarity in '\n",
" 'graphs can be measured in different ways, depending on whether '\n",
" 'the similarity is being measured between two graphs, or between '\n",
" 'two nodes in a single graph: For simplicity; undirected networks '\n",
" 'are assumed, though the measures can be easily generalized to '\n",
" 'directed networks.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Such fractional metrics can provide more effective results for '\n",
" 'the high-dimensional case. As a rule of thumb, the larger the '\n",
" 'dimensionality; the lower the value of p. However, no exact rule '\n",
" 'exists on the precise choice of p because dimensionality is not '\n",
" 'the only factor in determining the proper value of p. The '\n",
" 'precise choice of p should be selected in an '\n",
" 'application-specific way, with the use of benchmarking: The '\n",
" 'bibliographic notes contain discussions on the use of fractional '\n",
" 'metrics.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Figure 3.2b is derived from Fig: 3.2a, except that the results '\n",
" 'show the fraction of the Manhattan performance achieved by higher '\n",
" 'order norms It is evident that the rate of degradation with '\n",
" 'increasing p is higher when the dimensionality of the data is '\n",
" 'large. For 2-dimensional data, there is very little degradation. '\n",
" 'This is the reason that the value of p matters less in lower '\n",
" 'dimensional applications. 68 CHAPTER 3 SIMILARITY AND DISTANCES '\n",
" 'This argument has been used to propose the concept of fractional '\n",
" 'metric...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '2) , can be used on the data without affectingl the distance: '\n",
" 'Another interesting special case is that obtained by setting p = '\n",
" 'C The result of this computation is to select the dimension for '\n",
" 'which the two objects are the most distant from one another and '\n",
" 'report the absolute value of this distance: All other features '\n",
" 'are ignored.a The Lp-norm is one of the most popular distance '\n",
" 'functions used by data mining analysts. One of the reasons for '\n",
" 'its popularity is the natural intuitive appeal and '\n",
" 'interpretability o...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_138_outlier_analysis_TB.txt',\n",
" 'text': 'Therefore, care must be taken in the nested loop structure to '\n",
" 'ignore the trivial cases where X=Y while updating k-nearest '\n",
" 'neighbor distances. 8.5.2 Local Distance Correction Methods '\n",
" 'Section 3.2.1.8 of Chap. 3 provides a detailed discussion of the '\n",
" 'impact of the local data distribution on distance computation. In '\n",
" 'particular; it is shown that straightforward mea- sures, such as '\n",
" 'the Euclidean distance, do not reflect the intrinsic distances '\n",
" 'between data points when the density and shape of the clusters '\n",
" 'vary sig...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'MULTIDIMENSIONAL DATA 65 1.6 3.5 L1 (MANHATTAN) L2 (EUCLIDIAN) L4 '\n",
" 'L8 1.4 1.2 2.5 1 1 0.8 0.6 1 2 1 5 0.4 0.5 0.2 100 200 300 400 '\n",
" '500 600 700 800 900 DATA DIMENSIONALITY 100 10 20 30 40 50 60 70 '\n",
" '80 DATA DIMENSIONALITY 90 100 Contrasts with dimensionality (6) '\n",
" 'Contrasts with norms Figure 3.1: Reduction in distance contrasts '\n",
" 'with increasing dimensionality and norms are high dimensional '\n",
" 'because of the varying impact of data sparsity, distribution, '\n",
" 'noise, and feature relevance. This chapter will discuss these '\n",
" 'bro...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'It is important to remember that these normalization issues may '\n",
" 'not be relevant to every application. Some applications may '\n",
" 'require only translation, only scaling, or neither of the two. '\n",
" 'Other applications may require both: In fact; in some cases, the '\n",
" 'wrong choice of\\n'\n",
" '3.4.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'Virtually all data mining problems, such as clustering, outlier '\n",
" 'detection, and classification, require the computation of '\n",
" 'similarity: A formal statement of the problem of similarity 0 '\n",
" 'distance quantification is as follows: Given two objects 01 and '\n",
" '02, determine @ value of the similarity Sim(01,02) (or dis- tance '\n",
" 'Dist(01,02) ) between the two objects.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'This is a very special example where only a 1- dimensional '\n",
" 'projection works well. However , it may not be generalizable to '\n",
" 'an arbitrary data set_ A more general way of computing the '\n",
" 'distances in a class-sensitive way; is to use a soft weighting of '\n",
" 'different directions, rather than selecting specific dimensions '\n",
" 'in a hard way This can be achieved with the use of an appropriate '\n",
" 'choice of matrix A in Eq. 10.71.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1914_mining_time_series_data_TB.txt',\n",
" 'text': 'Other faster methods for approximation exist, such as the use of '\n",
" 'the SAX representation_ When the SAX representation is used, a '\n",
" 'table of precomputed distances can be maintained for all pairs of '\n",
" 'discrete values; and a simple table lookup is required for lower '\n",
" 'bounding: Furthermore, some other time series distance functions '\n",
" 'such as dynamic time warping can also be bounded from below _ The '\n",
" 'bibliographic notes contain pointers to some of these bounds.'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_1510_data_classification_TB.txt',\n",
" 'text': 'Instead of using the Euclidean distance metric, the distance '\n",
" 'between two d-dimensional points X and Y is defined with respect '\n",
" 'to a d X d matrix A Dist(X,Y) = Vr-Y)A(X 5 Y)T (10.71) This '\n",
" 'distance function is the same as the Euclidean metric when A is '\n",
" 'the identity matrix. Different choices of A can lead to better '\n",
" 'sensitivity of the distance function to the local and global data '\n",
" 'distributions. These different choices will be discussed in '\n",
" 'the following subsections. 10.8.1.1 Unsupervised Mahalanobis '\n",
" 'Metric ...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_2116_mining_spatial_data_TB.txt',\n",
" 'text': 'Therefore, mirror images can also be addressed using this '\n",
" 'approach; by incorpo rating the reversals of the series (and its '\n",
" 'rotations in the distance function This will increase the '\n",
" 'computation by a factor of 2_ The precise choice of distance '\n",
" 'function used is highly application-specific, depending on '\n",
" 'whether rotations O mirror image conversions are required: 16.2.5 '\n",
" 'Outlier Detection In the context of spatial data, outliers can be '\n",
" 'either point outliers and shape outliers. These two kinds of '\n",
" 'outliers are als...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': 'MULTIDIMENSIONAL DATA 67 3.5 1.5 1 1 1 0.5 2-DIMENSIONAL '\n",
" '10-DIMENSIONAL 100-DIMENSIONAL 2-DIMENSIONAL 10-DIMENSIONAL '\n",
" '100-DIMENSIONAL 2.5 1 1 5 0.5 VALUE of p VALUE of p (a) Contrast '\n",
" '(b) Contrast / Manhattan Contrast Figure 3.2: Impact of p on '\n",
" 'contrast subset selection during preprocessing; because the '\n",
" 'relevance of features is locally determined by the pair of '\n",
" 'objects that are being considered. Globally; all features may be '\n",
" 'relevant. When many features are irrelevant; the additive noise '\n",
" 'effects of the irrele...'}\n",
"\n",
"{ 'name': 'conv_d_mt_bch_83_similarity_and_distances_TB.txt',\n",
" 'text': '(3.22)\\n'\n",
" '86 CHAPTER 3 SIMILARITY AND DISTANCES This is the essence of the '\n",
" 'well-known Dijkstra algorithm_ This approach is linear in the '\n",
" 'number of edges in the network; because it examines each node and '\n",
" 'its incident edges exactly once The approach provides the '\n",
" 'distances from a single node to all other nodes in a single pass. '\n",
" 'The final value of SP(s,j) provides a quantification of the '\n",
" 'structural distance between node S and node j. Structural '\n",
" 'distance-based measures do not leverage the multiplicity in paths '\n",
" 'betw...'}\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F3M_i2VSDs8y"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zuTQrNSWoH0F"
},
"source": [
"# <center> Summarized Document Search </center>\n",
"\n",
"- enter a topic, and the pipeline will get you a summarized version of the best N results for that topic\n",
"- **Use case:** \"I want to know the general definition of a concept or term\"\n",
"\n",
"\n",
"## details\n",
"- uses `SearchSummarizationPipeline`\n",
"- summarization [options](https://huggingface.co/models?filter=summarization) (*to add a new one, just edit the dropdown*)\n",
"- in general, the `bigbird` models produce the best summaries, as they are designed for longer documents; next best is the `longformer`, followed by the \"standard\" `pegasus` models\n",
" - details on the [longformer](https://huggingface.co/allenai/led-large-16384) by allenai"
]
},
{
"cell_type": "code",
"metadata": {
"id": "FviP58rPBDPp",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "1405151f-6c26-4b92-c691-02ffde066bca"
},
"source": [
"sum_model = \"allenai/led-large-16384-arxiv\" # @param [\"google/bigbird-pegasus-large-bigpatent\", \"facebook/bart-large-cnn\", \"google/pegasus-reddit_tifu\", \"allenai/led-large-16384\", \"allenai/led-large-16384-arxiv\", \"google/pegasus-large\"]\n",
"\n",
"# decrease_if_crash (in this case, change)\n",
"\n",
"use_answer_fmt = True # @param {type:\"boolean\"}\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kHpqZ9SDefFL"
},
"source": [
"## model config \n",
"\n",
"- it is important to define generation parameters for the summarizer; otherwise, the summarized response may repeat itself many times\n",
"- see [transformers docs](https://huggingface.co/transformers/main_classes/configuration.html#transformers.PretrainedConfig.from_pretrained) for more detail on how this works\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "knHHDggu45tF",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "5f63863f-6041-4cc4-c3f1-86894b1ac37f"
},
"source": [
"from transformers import AutoModel\n",
"\n",
"\n",
"def customize_pegasus(hf_model_name, outfolder=\"custom_pegasus\"):\n",
"    \"\"\"Save a local copy of a pegasus-family checkpoint with custom generation defaults.\n",
"\n",
"    hf_model_name: model ID on the huggingface hub (pegasus-based).\n",
"    outfolder: subfolder of the current working directory to save into.\n",
"    Returns the path the customized model was saved to.\n",
"    \"\"\"\n",
"    # generation defaults baked into the saved model config\n",
"    max_l = 512  # max summary length, in tokens\n",
"    n_beams = 16  # decrease_if_crash\n",
"    len_penalty = 1\n",
"    rep_penalty = 3.01\n",
"    no_rpt_ngram = 2\n",
"    num_return_sequences = 1\n",
"\n",
"    # NOTE(review): AutoModel loads the base model without the LM head\n",
"    # (transformers warns about unused lm_head weights on load) -- confirm\n",
"    # the downstream summarizer restores a seq2seq head from this save.\n",
"    custom_model = AutoModel.from_pretrained(\n",
"        hf_model_name,\n",
"        max_length=max_l,\n",
"        num_beams=n_beams,\n",
"        length_penalty=len_penalty,\n",
"        num_return_sequences=num_return_sequences,\n",
"        no_repeat_ngram_size=no_rpt_ngram,\n",
"        repetition_penalty=rep_penalty,\n",
"        gradient_checkpointing=True,  # slower but saves CUDA memory\n",
"        max_position_embeddings=4096,\n",
"    )\n",
"    save_path = join(os.getcwd(), outfolder)\n",
"    custom_model.save_pretrained(save_path)\n",
"\n",
"    print(\"successfully created customized summarizer model\")\n",
"\n",
"    return save_path\n",
"\n",
"\n",
"def customize_LED(hf_model_name, outfolder=\"custom_LED\"):\n",
"    \"\"\"Save a local copy of an allenai LED checkpoint with custom generation defaults.\n",
"\n",
"    hf_model_name: model ID on the huggingface hub, e.g.\n",
"        allenai/led-base-16384 or allenai/led-large-16384.\n",
"    outfolder: subfolder of the current working directory to save into.\n",
"    Returns the path the customized model was saved to.\n",
"    \"\"\"\n",
"    max_l = 512\n",
"    n_beams = 12  # decrease_if_crash\n",
"    len_penalty = 3.5\n",
"    rep_penalty = 3.01\n",
"    no_rpt_ngram = 2\n",
"    num_return_sequences = 1\n",
"\n",
"    custom_model = AutoModel.from_pretrained(\n",
"        hf_model_name,\n",
"        max_length=max_l,\n",
"        num_beams=n_beams,\n",
"        length_penalty=len_penalty,\n",
"        num_return_sequences=num_return_sequences,\n",
"        no_repeat_ngram_size=no_rpt_ngram,\n",
"        repetition_penalty=rep_penalty,\n",
"        gradient_checkpointing=True,  # slower but saves CUDA memory\n",
"    )\n",
"    save_path = join(os.getcwd(), outfolder)\n",
"    custom_model.save_pretrained(save_path)\n",
"\n",
"    print(\"successfully created customized summarizer model\")\n",
"\n",
"    return save_path"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BiwemyPRe5ED"
},
"source": [
"## load model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Kn66kSLoxXMq",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 293,
"referenced_widgets": [
"b66e639d33964b909d78307b964b65fd",
"a031cd7bce7544d7a2ce54d3bff73cbb",
"72b3bdb9a04c4a9aa9fd81f2d9e6a662",
"dfb0cf603bf94789880919363c8521c9",
"bb90b97c0239499aa2e074b8b23afa90",
"634b387cb25d4582ba23976c04a6e8ed",
"70d8fb74b163420b8e1aaaa2368f98cb",
"68c585b41a9c4554af93c17530561707",
"0b3f4c76165a4a2aa5afcdace96b6255",
"4e01882dfc014bdb89e4ee2150e72306",
"bc375c8e893146ffb099077b6f86f1e8",
"5e3cea0100f44680a98b29ef506c1240",
"949b9cb8562249ad93015da60bcf0af4",
"5b82531520be4057919d8d430a945e87",
"c3ad3251d7964845912e81bed6c4933f",
"a4080cdae7314dbba708ce7a18416ea1",
"7d7b45d8466a4c44bf60ce1a161be880",
"3488f4a6cc4c4f1997f97fc387ea3427",
"f5f274fcb4754de7be311d4c05ddd9eb",
"6dc6cb3ba8084721a0557e6b9f299cd2",
"3c53c54187d14ea9b1981c837ea60290",
"b648b2fa5d0f47a585a2277f209d3b99"
]
},
"outputId": "3a6b6b2c-f976-4bab-f3f2-af42e92f6a24"
},
"source": [
"# load summarizer\n",
"from haystack.summarizer import TransformersSummarizer\n",
"\n",
"gc.collect()\n",
"\n",
"# pick the model source: pegasus- and LED-family checkpoints get custom\n",
"# generation parameters saved locally (see the helper functions above);\n",
"# anything else loads as-is\n",
"model_id = sum_model.lower()\n",
"if \"pegasus\" in model_id:\n",
"    print(\"using custom model parameters - PEGASUS\\n\\n\")\n",
"    custom_model_path = customize_pegasus(sum_model)\n",
"    model_source = custom_model_path\n",
"elif \"led\" in model_id and \"allenai\" in model_id:\n",
"    print(\"using custom model parameters - Longformer LED\\n\\n\")\n",
"    custom_model_path = customize_LED(sum_model)\n",
"    model_source = custom_model_path\n",
"else:\n",
"    # load straight from the huggingface hub\n",
"    model_source = sum_model"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"using custom model parameters - Longformer LED\n",
"\n",
"\n"
]
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b66e639d33964b909d78307b964b65fd",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.29k [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5e3cea0100f44680a98b29ef506c1240",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.84G [00:00<?, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Some weights of the model checkpoint at allenai/led-large-16384-arxiv were not used when initializing LEDModel: ['final_logits_bias', 'lm_head.weight']\n",
"- This IS expected if you are initializing LEDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing LEDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"successfully created customized summarizer model\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0hztjKONcRoG",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "d5bdccde-253b-4db9-d243-0df0b89e2d18"
},
"source": [
"%%capture\n",
"gc.collect()\n",
"summarizer = TransformersSummarizer(\n",
" model_name_or_path=model_source, tokenizer=sum_model, min_length=32, \n",
" max_length=256, generate_single_summary=True,\n",
" separator_for_single_summary=\" \",\n",
" \n",
")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "P-rTpMmxp_vl",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "cfc92765-ea28-4f24-80d5-f91dc41de927"
},
"source": [
"# initialize summarize + search pipeline\n",
"from haystack.pipeline import SearchSummarizationPipeline\n",
"\n",
"sumsearch_pipe = SearchSummarizationPipeline(summarizer=summarizer, \n",
" retriever=retriever, \n",
" return_in_answer_format=use_answer_fmt)\n",
"\n",
"# sumsearch_pipe = SearchSummarizationPipeline(summarizer, \n",
"# retriever,\n",
"# return_in_answer_format=use_answer_fmt)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Xm49XvWABcVG"
},
"source": [
"## single query\n",
"\n",
"- `num_results` is how many documents it pulls to summarize. \n",
" - If the summarizer is not configured correctly, it will start repeating itself as the number gets higher\n",
" - increasing this parameter loads the GPU more"
]
},
{
"cell_type": "code",
"metadata": {
"id": "DlZWhub2Bdmb",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 953
},
"cellView": "form",
"outputId": "67fb195f-bd31-4485-bf5d-dc3bb29e63fb"
},
"source": [
"ss_query = \"k-means\" # @param {type:\"string\"}\n",
"num_results = 50# @param {type:\"integer\"} # decrease_if_crash\n",
"w_one_sum = True # @param {type:\"boolean\"}\n",
"\n",
"# run retrieval + summarization for the query above\n",
"search_result = sumsearch_pipe.run(\n",
"    query=ss_query,\n",
"    params={\n",
"        \"retriever\": {\"top_k\": num_results},\n",
"        \"summarizer\": {\"generate_single_summary\": w_one_sum, \"truncation\": 4096},\n",
"    },\n",
")\n",
"answers_list = search_result[\"answers\"]\n",
"for count, entry in enumerate(answers_list, start=1):\n",
"    this_answer = clean_output(entry[\"answer\"])\n",
"    this_context = clean_output(entry[\"context\"])\n",
"    print(\"\\n\\nItem #{} - answer\\n\".format(count))\n",
"    pp.pprint(this_answer)\n",
"    print(\"\\n the context (first 2k chars) is: \\n\")\n",
"    pp.pprint(this_context[:2000] + \"...\")\n",
"\n",
"    # log the query/response pair for later export\n",
"    new_row_sum = {\n",
"        \"query\": ss_query,\n",
"        \"response\": this_answer,\n",
"        \"query_type\": \"summary_search\",\n",
"        \"doc_group\": course_name,\n",
"        \"model_name\": sum_model,\n",
"        \"context\": this_context,\n",
"    }\n",
"    info_queries = info_queries.append(new_row_sum, ignore_index=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n",
"Item #1 - answer\n",
"\n",
"('cluster analysis is often used to provide & descriptive statistic for '\n",
" 'ascertaining the extent to which the observations comprising the data base '\n",
" 'fall into natural distinct groupings. however, it is crucial to have the '\n",
" 'ability to detect near duplicates in order to assign many documents '\n",
" 'meaningfully to clusters when they do not contain a significant number of '\n",
" 'words from this small lexicon subset. an approach known as shingling is '\n",
" 'commonly used for this purpose: a k-shingle from a document is simply a '\n",
" 'string of k consecutively occurring words in the document. 2_ stemming '\n",
" 'refers to common root extraction from words, and the extracted root may not '\n",
" 'even be a word in of itself. Of course, the drawback is that the word hop '\n",
" 'has a different meaning and usage of its own. Nevertheless, stemming usually '\n",
" 'enables higher quality results in mining applications 3_ Punctuation marks: '\n",
" 'after stemming has been performed, punctuation mark such as commas and '\n",
" 'semicolons are removed. 4. The fraction of transactions This rule was '\n",
" 'derived in some early publications on supermarket data. notice that page 4 '\n",
" 'has no incoming links and hence gets the minimum PageRank of 0.15_ '\n",
" 'bibliographic Notes there are many books on clustering, including Hartigan '\n",
" '(1975), Gordon (1999) and')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The number of distinct words in these five documents will typically be a '\n",
" 'very small subset of the entire lexicon: Therefore, the first few iterations '\n",
" 'of k-means may not be able to assign many documents meaningfully to clusters '\n",
" 'when they do not contain a significant number of words from this small '\n",
" 'lexicon subset. This initial 13.3. Therefore, it is crucial to have the '\n",
" 'ability to detect near duplicates. An approach known as shingling is '\n",
" 'commonly used for this purpose: A k-shingle from a document is simply a '\n",
" 'string of k consecutively occurring words in the document . A shingle can '\n",
" 'also be viewed as a k-gram: For example, consider the document comprising '\n",
" 'the following sentence: Mary had a little lamb, its fleece was white aS '\n",
" 'snou. In other words; a k-itemset is a set of items of cardinality k. The '\n",
" 'fraction of transactions This rule was derived in some early publications on '\n",
" 'supermarket data. No assertion is made here about the likelihood of such a '\n",
" 'rule appearing in an arbitrary supermarket data set. 4.2. In many cases, '\n",
" 'stemming refers to common root extraction from words, and the extracted root '\n",
" 'may not even be a word in of itself. For example, the common root of hoping '\n",
" 'and hope is hop. Of course, the drawback is that the word hop has a '\n",
" 'different meaning and usage of its own. Therefore, while stemming usually '\n",
" 'improves recall in document retrieval, it can sometimes worsen precision '\n",
" 'slightly. Nevertheless, stemming usually enables higher quality results in '\n",
" 'mining applications 3_ Punctuation marks: After stemming has been performed, '\n",
" 'punctuation marks, such as commas and semicolons, are removed. For example, '\n",
" 'a company may employ K sales people, and the goal is to partition a customer '\n",
" 'database into K segments; one for each sales person, such that the customers '\n",
" 'assigned to each one are as Similar as possible. Often, however cluster '\n",
" 'analysis is used to provide & descriptive statistic for ascertaining the '\n",
" 'extent to which the observations comprising the data base fall into '\n",
" 'natural...')\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hzY_tJzzSR1w",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 865
},
"outputId": "2d15e12b-bf90-4d30-b7a8-1631a33aa209"
},
"source": [
"ss_query = \"metric\" # @param {type:\"string\"}\n",
"num_results = 60# @param {type:\"integer\"} # decrease_if_crash\n",
"w_one_sum = True # @param {type:\"boolean\"}\n",
"\n",
"search_result = sumsearch_pipe.run(\n",
" query=ss_query,\n",
" params={\"retriever\": {\"top_k\": num_results}, \"summarizer\": {\"generate_single_summary\": w_one_sum}},\n",
")\n",
"# print_documents(search_result, max_text_len=256)\n",
"answers_list = search_result[\"answers\"]\n",
"count = 0\n",
"for entry in answers_list:\n",
"\n",
" this_answer = clean_output(entry[\"answer\"])\n",
" this_context = clean_output(entry[\"context\"])\n",
" count += 1\n",
" print(\"\\n\\nItem #{} - answer\\n\".format(count))\n",
" pp.pprint(this_answer)\n",
" print(\"\\n the context (first 2k chars) is: \\n\")\n",
" pp.pprint(this_context[:2000] + \"...\")\n",
"\n",
" new_row_sum = {\n",
" \"query\": ss_query,\n",
" \"response\": this_answer,\n",
" \"query_type\": \"summary_search\",\n",
" \"doc_group\": course_name,\n",
" \"model_name\": sum_model,\n",
" \"context\": this_context,\n",
" }\n",
" info_queries = info_queries.append(new_row_sum, ignore_index=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n",
"Item #1 - answer\n",
"\n",
"('the most common distance function for quantitative data is the lp-norm : 1/p '\n",
" 'Dist(x,y) = ixci - yi|p i=1 (3.1) 2 special cases of the metric are the '\n",
" 'Euclidean (p 2 ) and the Manhattan ( p = 1 ) metrics. these two metrics '\n",
" 'derive their intuition from spatial applications where they have clear '\n",
" 'physical interpretability, such as city block driving distances in a region '\n",
" 'in which the streets are arranged as a rectangular grid ; or the number of '\n",
" 'passenger cars per capita in the state of united states. this section will '\n",
" 'therefore study each of these types separately: 3.2.0 quantitative data ; '\n",
" '4.4 generalizing linear discriminant analysis ; 5.7 differential time series '\n",
" '; 6.6 nonlinear embeddings ; 7.8 covariance matrix ; 8.9 regular '\n",
" 'distribution channels ; 9.10 normalization ; 10.11 standard deviation ; '\n",
" '11.12 uniformity ; 12.13 spherical ker- nel ; 13.14 analytic geometry ; '\n",
" '14.15 orthogonality ; 15.16 geometry ; 16.17 mathematics ; 17.18 computer '\n",
" 'science ; 18.19 mathematical sciences ; 19.20 physics ; 20.25.')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Metric and at this point I may briefly raise an issue about terminology in '\n",
" 'general so the technical term so obviously the technical term for something '\n",
" 'that satisfies these axioms is metric people use the term metric or distance '\n",
" 'sometimes int Erchangeably though, so this is to be a little bit careful. '\n",
" 'The default spherical ker- nel (6.13) gives equal weight to each coordinate, '\n",
" 'and SO a natural default strategy is to standardize each variable to unit '\n",
" 'standard deviation. This section will therefore study each of these types '\n",
" 'separately: 3.2.1 Quantitative Data The most common distance function for '\n",
" 'quantitative data is the Lp-norm: The Lp-norm between two data points X = '\n",
" '(11 84) and Y = (y1 Ya) is defined as follows: 1/p Dist(X,Y) = Ixci - yi|p '\n",
" 'i=1 (3.1) Two special cases of the Lp-norm are the Euclidean (p 2) and the '\n",
" 'Manhattan (p = 1) metrics These special cases derive their intuition from '\n",
" 'spatial applications where they have clear physical interpretability The '\n",
" 'Euclidean distance is the straight-line distance between two data points. '\n",
" 'The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. Furthermore, the mean can '\n",
" 'be somewhat misleading in that it is typically not a value that occurs in '\n",
" 'the sample, and it may not even be a value that the random variable can '\n",
" 'actually assume (for a discrete random variable)_ For example, the number of '\n",
" 'cars per capita is an integer valued random variable, but according to the '\n",
" 'US Bureau of Transportation Studies, the average number of passenger cars in '\n",
" 'the US was 0.45 in 2008 (137.1 million cars, with a pop...')\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ppx_dPrQe-D1"
},
"source": [
"## query list of terms"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gnkGwlBGazgG"
},
"source": [
"### custom search function"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4kY-l4ZXaRnX",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d1735375-699c-4bc2-d4a7-f0a026a1ece3"
},
"source": [
"from google.colab import files\n",
"import pprint as pp\n",
"\n",
"\n",
"def search_for_terms(\n",
"    srch_pipe,\n",
"    st_list,\n",
"    k_search=50,\n",
"    one_answer=True,\n",
"    answer_fmt=True,\n",
"    export_txt=True,\n",
"    no_context_in_txt=True,\n",
"    add_text=\"\",\n",
"    doc_ext=\".md\",\n",
"):\n",
"    \"\"\"Run the summarize+search pipeline for each term and export a report.\n",
"\n",
"    srch_pipe: haystack pipeline with a retriever and a summarizer.\n",
"    st_list: list of search-term strings to query.\n",
"    k_search: top_k documents the retriever pulls per term.\n",
"    one_answer: forwarded to the summarizer as generate_single_summary.\n",
"    answer_fmt: unused -- NOTE(review): the answer format is fixed when the\n",
"        pipeline is constructed; confirm this parameter can be dropped.\n",
"    export_txt: if True, also download the report via google.colab files.\n",
"    no_context_in_txt: if True, omit raw context from the exported report.\n",
"    add_text: extra text inserted into the output filename.\n",
"    doc_ext: extension for the exported report (\".md\" or \".txt\").\n",
"\n",
"    Side effects: appends one row per answer to the global info_queries\n",
"    dataframe, writes the report to disk, and calls put_in_dropbox().\n",
"    \"\"\"\n",
"    gc.collect()\n",
"    st_document = []\n",
"    global info_queries\n",
"    spacer = \"\\n------------------\\n\"\n",
"    for term_no, search_term in enumerate(\n",
"        tqdm(st_list, total=len(st_list), desc=\"getting defs for search_terms...\")\n",
"    ):\n",
"        print(spacer)\n",
"\n",
"        # enumerate() instead of st_list.index(search_term): O(1) per item\n",
"        # and still correct if the same term appears twice in st_list\n",
"        search_term_text = \"search_term {} of {}: {}\".format(\n",
"            term_no, len(st_list), search_term\n",
"        )\n",
"        search_term_text = search_term_text.replace(\"<n>\", \" \")\n",
"        st_document.append(spacer)\n",
"        st_document.append(\"\\n### \" + search_term_text + \" \\n+++++++\")\n",
"        pp.pprint(search_term_text)\n",
"        print(\"\\n\")\n",
"        search_out = srch_pipe.run(\n",
"            query=search_term,\n",
"            params={\"retriever\": {\"top_k\": k_search}, \n",
"                    \"summarizer\": {\"generate_single_summary\": one_answer}},\n",
"        )\n",
"        search_answers = search_out[\"answers\"]\n",
"        count = 0\n",
"        print(\"\\n+++++++\\n\")\n",
"        for memo in search_answers:\n",
"\n",
"            this_answer = clean_output(memo[\"answer\"])\n",
"            this_context = clean_output(memo[\"context\"])\n",
"            count += 1\n",
"            # bugfix: the denominator used to be len(answers_list), a\n",
"            # leftover global from an earlier cell; count against this\n",
"            # query's own answer list instead\n",
"            this_answer_header = \"\\nmodel description: #{} of {}\\n\".format(\n",
"                count, len(search_answers)\n",
"            )\n",
"            print(this_answer_header)\n",
"            pp.pprint(this_answer)\n",
"            this_context_header = \"\\n the context (first 2k chars) is: \\n\"\n",
"            print(this_context_header)\n",
"            pp.pprint(this_context[:2000] + \"...\")\n",
"            if no_context_in_txt:\n",
"                # drop context from the exported file (console still shows it)\n",
"                # NOTE(review): this also blanks the context stored in\n",
"                # info_queries below -- confirm that is intended\n",
"                this_context_header = \"\"\n",
"                this_context = \"\"\n",
"            st_document.extend(\n",
"                [\n",
"                    \"\\n\" + this_answer_header,\n",
"                    this_answer,\n",
"                    this_context_header,\n",
"                    this_context + \"\\n\",\n",
"                ]\n",
"            )\n",
"\n",
"            new_row_sum = {\n",
"                \"query\": search_term,\n",
"                \"response\": this_answer,\n",
"                \"query_type\": \"summary_search\",\n",
"                \"doc_group\": course_name,\n",
"                \"model_name\": sum_model,\n",
"                \"context\": this_context,\n",
"            }\n",
"            info_queries = info_queries.append(new_row_sum, ignore_index=True)\n",
"\n",
"    date_time = datetime.now().strftime(\"%m.%d.%Y_%H-%M\")\n",
"    this_outname = (\n",
"        remove_string_extras(\n",
"            \"SumSearch-_{}_exp_{}\".format(course_name, add_text)\n",
"            + \"_\"\n",
"            + sum_model\n",
"            + \"_\"\n",
"            + date_time\n",
"            + \"_\"\n",
"        )\n",
"        + doc_ext\n",
"    )\n",
"    with open(this_outname, \"w\", encoding=\"utf-8\", errors=\"ignore\") as ss_f:\n",
"        ss_f.writelines(st_document)\n",
"    put_in_dropbox(this_outname)\n",
"    if export_txt:\n",
"        files.download(this_outname)\n",
"\n",
"    print(\"\\nCompleted Summary Search - \", date_time)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bh8Pjm1Td-TI",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3fda407f-8fe5-43b1-bbc0-9cbc16c21fae"
},
"source": [
"my_terms = [\n",
" \"distance metric\",\n",
" \"manhattan distance\",\n",
" \"hamming distance\",\n",
" \"euclidean distance\",\n",
" \"chebyshev distance\",\n",
" \"minkowski distance\",\n",
" \"function homogeneity\",\n",
" \"translation invariance\",\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LQNc__1AAm2I",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "940f3958-6638-41de-960a-8b8ef8ddf6e6"
},
"source": [
"PR_ML = [\n",
" \"data set\",\n",
" \"gaussian distribution\",\n",
" \"log likelihood function\",\n",
" \"linear regression model\",\n",
" \"graphical models\",\n",
" \"latent variables\",\n",
" \"error function\",\n",
" \"training data\",\n",
" \"conditional mixture models\",\n",
" \"mixture density network\",\n",
" \"log marginal likelihood\",\n",
" \"posterior probability\",\n",
" \"maximum\",\n",
" \"relevance vector machine\",\n",
" \"directed graph\",\n",
" \"bayesian model\",\n",
" \"model parameters\",\n",
" \"class\",\n",
" \"sum of squares error\",\n",
" \"multivariate gaussian\",\n",
" \"synthetic data points\",\n",
" \"probabilistic pca\",\n",
" \"component analysis\",\n",
" \"sampling\",\n",
" \"markov chain\",\n",
" \"output unit activation\",\n",
"]\n",
"\n",
"past_exams = [\n",
"\n",
"]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IbopeVKyqfw-"
},
"source": [
"## Run List of Search Terms\n",
"\n",
"- Depending on database size, number of questions, and so on, the QA list and the search-term list may need to be run separately."
]
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "KbFUqxvb89q8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "2830d6b1-7743-431c-d2eb-566dadedcd13"
},
"source": [
"run_search_term_list = True # @param {type:\"boolean\"}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XpHGprmge-8Y",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"0c0a7675aad74a748c35333306417836",
"b042e089cd224ccd8ec9ac21b78e5a55",
"a013a190a0794204877a1c4268739da2",
"5c08be3930674a4db40bfc5fa9689b55",
"3a9471cd4cf449768add43beb1116ffc",
"baf5f95a331144449b8d956019d456af",
"3fbbac66544b413d8b09a447774a875f",
"e3e5b903034042459eb19d84d86822c3",
"1f949b6cffce4ec9be310bb15bbe662e",
"7fff9a716ebf449888de5a08bd4eed66",
"61758c7ae2094fec960c5c0533f65366"
]
},
"outputId": "b4650a0c-d2f7-459b-d88c-80d39f31874d"
},
"source": [
"if run_search_term_list:\n",
" # iterates through a primary list of terms in `my_terms`\n",
" search_for_terms(\n",
" sumsearch_pipe,\n",
" my_terms,\n",
" k_search=60, # decrease_if_crash\n",
" add_text=\"_main_{}_\".format(questions_version),\n",
" export_txt=False,\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0c0a7675aad74a748c35333306417836",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"getting defs for search_terms...: 0%| | 0/8 [00:00<?, ?it/s]"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"------------------\n",
"\n",
"'search_term 0 of 8: distance metric'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('time-series and discrete-sequence similarity measures are closely related '\n",
" 'because the latter can be considered the categorical version of the former. '\n",
" 'a more sophisticated approach; referred to as isomap, uses nonlinear '\n",
" 'embeddings to account for minor changes in the generative process for that '\n",
" 'point. the intuitive rationale for this is that only short point- to-point '\n",
" 'jumps from data points to one of their k-nearest neighbors based on a '\n",
" 'standard metric reflects the aggregate change (distance ) from one point to '\n",
" 'another (distant) point more accurately than a straight-line distance '\n",
" 'between the points. 1/p Dist(X,Y) = Ixci - yi|p i=1 (3.1) 2 special cases of '\n",
" 'lp-norms such as the Euclidean measure ; the bibliographic notes contain '\n",
" 'pointers to some of these bounds. 559-572.6 : clustering stability for the '\n",
" 'principal compo nents dataset, with n = 150, using the K-means algorithm. '\n",
" '12.4: generalizing linear Discriminant Analysis 439 3.5: impact of ISOMAP '\n",
" 'embedding on distances along principal components according to their '\n",
" 'variance. 10.10: we study the clust')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. Among the three data '\n",
" 'points A, B, and C, which pair are the closest to one another? At first '\n",
" 'sight; it would seem that data points A and B are the closest on the basis '\n",
" 'of Euclidean distance. However, the global data distribution tells us '\n",
" 'otherwise. One way of understanding distances is as the shortest length of '\n",
" 'the path from one data point to another, when using only point-to-point '\n",
" 'jumps from data points to one of their k-nearest neighbors based on a '\n",
" 'standard metric 3.2. Therefore, the overall sum of the point-to-point jumps '\n",
" 'reflects the aggregate change (distance) from one point to another (distant) '\n",
" 'point more accurately than a straight-line distance between the points. Such '\n",
" 'distances are referred to as geodesic distances: In the case of Fig: 3.4, '\n",
" 'the only way to walk from A to B with short point-to-point jumps is to walk '\n",
" 'along the entire elliptical shape of the data distribution while passing C '\n",
" 'along the way: Therefore, A and B are actually the farthest pair of data '\n",
" 'points (from A; B, and C) on this basis! Conversely; a pair of objects are '\n",
" 'unlikely to have similar values across many attributes, just by chance, '\n",
" 'unless these attributes were relevant _ Interestingly, the Euclidean metric '\n",
" '(and Lp-norm in general) achieves exactly the opposite effect by using the '\n",
" 'squared sum of the difference in attribute values: As a result; the \"noise\" '\n",
" 'components from the irrelevant attributes dominate the computation and mask '\n",
" 'the similarity effects of a large number of relevant attributes. The ...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 1 of 8: manhattan distance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this chapter will discuss these broader principles in the context of '\n",
" 'distance function design. for this exercise the idea was for you to '\n",
" 'implement individual distances, such as the Manhattan distance the humming '\n",
" 'distance and the Euclidean distance. all of these using Namibia can be '\n",
" 'implemented in a single line of victory arithmetic because it has all these '\n",
" 'neat operations that. Called distance under Skol of a dot pie and this '\n",
" 'already contains the headers or the prototypes for the following distance '\n",
" 'functions Manhattan dish hammering dish including his cheviche DI menkowski '\n",
" 'did never mind what those are now you will learn about this is actually one '\n",
" 'of the simplest solutions that you could have this. In this case, because '\n",
" 'one end point is the origin, and all coordinates are nonnegative, the '\n",
" 'manhattan distance will sum up the coordinates of X over the different '\n",
" 'dimensions each coordinate is uniformly distributed in [0,1]. Therefore; if '\n",
" 'Yi represents the uniform distributed random variable in @x(y - 0 ) i=1 '\n",
" '(3.3) = 3 and d = 4.5: impact of isomap embedding on distances such at the '\n",
" 'expense of data sparsity, distribution, noise, and feature relevance. '\n",
" 'mULTIDIMENSIONAL DATA 71 0.6 POINT b')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. And both the Manhattan '\n",
" 'and the j The hamming distance seems to satisfy this requirement and to a '\n",
" 'lesser extent the exclusion distance, however the church and the minkowsky '\n",
" 'metrics they give the same average distance and we can take a look at this '\n",
" 'this is not super required here but it always helps to have an example at '\n",
" 'hand so there is an. A distance which can be a little bit confusing because '\n",
" 'if you if you think about the properties here it has some d Different '\n",
" 'properties than a Manhattan distance which is called a distance, but which '\n",
" \"is in a sense a real distance, whereas here you don't have this because \"\n",
" 'these properties that you have the identity of indiscernible and the. '\n",
" 'Therefore_ most of the points in the cube lie within a distance range of '\n",
" \"Dmax Dmin 60 '3d from the origin_ Note that the expected Manhattan distance \"\n",
" 'grows with dimensionality at a rate that is linearly proportional to d. '\n",
" 'Therefore; the ratio of the variation in the distances to the absolute '\n",
" 'values that is referred to as Contrast(d) is given by: Dmar Dmin Contrast '\n",
" \"(d) = '12/d. Called distance under Skol of a dot pie and this already \"\n",
" 'contains the headers or the prototypes for the following distance functions '\n",
" 'Manhattan dish hammering dish including his cheviche DI menkowski did never '\n",
" 'mind what those are now you will learn about this in the lecture and your '\n",
" 'goal I S too. Late these distances and return their value for the two '\n",
" 'countries and I have to make I have to point out one cabinet here because we '\n",
" 'are getti...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 2 of 8: hamming distance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('for this exercise the idea was for you to implement individual distances, '\n",
" 'such as the Manhattan distance the humming distance, the Euclidean distance. '\n",
" 'all of these using Namibia can be implemented in a single line of victory '\n",
" 'arithmetic because it has all these neat operations that. Of equality, they '\n",
" 'actually make a lot of intuitive sense so the identity of indiscernibles '\n",
" \"means that that the distance really tells you whether you're close to the to \"\n",
" 'source point or not and the t w if that were a distance measure then it '\n",
" 'would tell you some places in The world even though you are not. So you '\n",
" 'should now be seeing my editor and this is actually one of the simplest '\n",
" 'solutions that you could have this. Asked the for and I was but habitat to '\n",
" 'answer the form and was asked will we go by strides of one when we '\n",
" 'determined the set of three minutes do we always have a race the sliding '\n",
" 'window approach and we just. As if you have as a simile Rity measure just '\n",
" 'the caution of an expemencation of uploading wicasa different your clean '\n",
" 'distant squared there you had the stigma which gives you a natural length '\n",
" 'scale if ya choose for example this perimeter of colonel too large '\n",
" 'everything will be so. You count it as one if it coincides with')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('CATEGORICAL ATTRIBUTES 99 Euclidean Distance The Euclidean distance between '\n",
" 'Xi and Xj is given as T 6(xi, Xj) = Ilxi 5 Xj ll Xi Xi 2xiXj +xjXj 2d S '\n",
" \"Thus, the maximum Euclidean distance between any two points is '2d, which \"\n",
" 'hap- pens when there are n0 common symbols between them, i.e., when s = 0. '\n",
" 'Hamming Distance The Hamming distance between Xi and Xj is defined as the '\n",
" 'number of mismatched values ou (xi,xj) = d-8 = 1 2 6(Xi,- X;)? Hamming '\n",
" 'distance is thus equivalent to half the squared Euclidean distance. Of '\n",
" 'equality, they actually make a lot of intuitive sense so the identity of '\n",
" \"indiscernibles means that that the distance really tells you whether you're \"\n",
" 'close to the to the source point or not and the t w if that were a distance '\n",
" 'measure then it would tell you to some places in The world, it would just '\n",
" \"tell you here you're very close even though you are not. So you should now \"\n",
" 'be seeing my editor and this is actually one of the simplest solutions that '\n",
" 'you could have this. For this exercise the idea was for you to implement '\n",
" 'Individual distances, such as the Manhattan distance the humming distance '\n",
" 'the Euclidean distance and so on and so forth and all of these using Namibia '\n",
" 'can be implemented in a single line of victory arithmetic because Namibia '\n",
" 'has all these neat operations that. As if you have as a simile Rity measure '\n",
" 'just the caution of the expemencation of the uploading wicasa different your '\n",
" 'clean distant squared there you have the stigma which gives you a natural '\n",
" 'length scale if you choose for example this perimeter of the colonel too '\n",
" 'large everything will be so. Asked the for and I was but habitat to answer, '\n",
" 'but that I did it mean this comes really down to the definition of your base '\n",
" 'colonel or ugly reply the base colonel and this was asked in the form and '\n",
" 'was asked will, do we go by strides of one when we determined the set of '\n",
" 'three minutes do we always have a race the sliding window approach and we '\n",
" 'just. Therefore, the overall sum of the point-to-point jumps refl...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 3 of 8: euclidean distance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('distances are defined as the shortest length of the path from one data point '\n",
" 'to another, when using only point-to-point jumps from data points to one of '\n",
" 'their k-nearest neighbors based on a standard metric 3.2.1. specifically; '\n",
" 'the Euclidean distance is used to define the outlier score. for this '\n",
" 'exercise the idea was for you to implement individual distances, such as ( 1 '\n",
" ') the Manhattan distance the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid ; ( 2)the honey - '\n",
" 'humming distance and ( 3 ) so on and so forth and all of these using Namibia '\n",
" 'can be implemented in single line of victory arithmetic because it has all '\n",
" 'these neat operations that. So you should now be seeing my editor and this '\n",
" 'is actually one simple solution that you could have this. 66 chAPTER 3 '\n",
" 'SIMILARITY AND DISTANCES to better understand the impact of dimensionality '\n",
" 'curse on distances; let us examine a unit cube of dimensionalality d that is '\n",
" 'fully located in the nonnegative quadrant, with one corner at the origin '\n",
" '0.01 inside the cube. among the three data pairs, which pair are the closest '\n",
" 'on the basis of geodesic distances')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. Therefore, the overall '\n",
" 'sum of the point-to-point jumps reflects the aggregate change (distance) '\n",
" 'from one point to another (distant) point more accurately than a '\n",
" 'straight-line distance between the points. Such distances are referred to as '\n",
" 'geodesic distances: In the case of Fig: 3.4, the only way to walk from A to '\n",
" 'B with short point-to-point jumps is to walk along the entire elliptical '\n",
" 'shape of the data distribution while passing C along the way: Therefore, A '\n",
" 'and B are actually the farthest pair of data points (from A; B, and C) on '\n",
" 'this basis! 66 CHAPTER 3 SIMILARITY AND DISTANCES To better understand the '\n",
" 'impact of the dimensionality curse on distances; let US examine a unit cube '\n",
" 'of dimensionality d that is fully located in the nonnegative quadrant, with '\n",
" 'one corner at the origin 0. What is the Manhattan distance of the corner of '\n",
" 'this cube (say; at the origin) to a randomly chosen point X inside the cube? '\n",
" 'Among the three data points A, B, and C, which pair are the closest to one '\n",
" 'another? At first sight; it would seem that data points A and B are the '\n",
" 'closest on the basis of Euclidean distance. However, the global data '\n",
" 'distribution tells us otherwise. One way of understanding distances is as '\n",
" 'the shortest length of the path from one data point to another, when using '\n",
" 'only point-to-point jumps from data points to one of their k-nearest '\n",
" 'neighbors based on a standard metric 3.2. Specifically; the Euclidean '\n",
" 'distance of a data point to its kth-nearest neighbors is used to define the '\n",
" 'outlie...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 4 of 8: chebyshev distance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this chapter studies the concept of linear indepen- dence. it is defined as '\n",
" 'the aggregate change (distance ) from one point to another (distant) point '\n",
" 'more accurately than a straight-line distance between the points. such '\n",
" 'distances are referred to as geodesic distances : only short point- to-point '\n",
" 'jumps can accurately measure minor changes in the generative process for '\n",
" 'that point. therefore, the overall sum of the point - to - point jumps '\n",
" 'reflects the aggregated change better than an individual straight line '\n",
" 'distance. 2.5 Linear Independence 41 to describe the location of Kigali '\n",
" 'because the geographic coordinate sys- tem may be considered a '\n",
" 'two-dimensional vector space (ignoring altitude and the Earth s curved '\n",
" 'surface ) _ The person may add, \"It is about 751km West of here: although '\n",
" 'this last statement is true, it does not necessary to find out its location '\n",
" 'given the previous information ( see @xmath0 for an illus- tration ). '\n",
" 'linearly dependent linear independent Example 213 (Linearly Dependent '\n",
" 'Vectors ) a geographic example may help to clarify the notion of nonlinear '\n",
" 'independence. 3.8: Probability p(xTx < -2n(a)), with & = 0.')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('MULTIDIMENSIONAL DATA 71 0.6 POINT B POINT C POINT A 0.4 0.8 0.2 POINT A 0.6 '\n",
" 'POINT C 0.4 0.2 0.2 POINT B 0.4 0.6 0.2 1.5 0.6 0.5 0.5 0.2 -0.2 -0.4 -0.6 '\n",
" '-0.5 -0.5 55 (a) A and C seem close (original data) (b) A and C are actually '\n",
" 'far away (ISOMAP embedding) Figure 3.5: Impact of ISOMAP embedding on '\n",
" 'distances such as the Euclidean measure. The intuitive rationale for this is '\n",
" 'that only short point- to-point jumps can accurately measure minor changes '\n",
" 'in the generative process for that point. Last measurement in in a prime two '\n",
" 'a minus one and a minus one now if we compared the two distance x in prime '\n",
" 'simply using a pictorial distance function minkovsky distance and we Would '\n",
" 'do it just like that we go from the last measurement for the first, but one '\n",
" 'first, but last second butlers at one more step back and another step back '\n",
" 'and so. Therefore, the overall sum of the point-to-point jumps reflects the '\n",
" 'aggregate change (distance) from one point to another (distant) point more '\n",
" 'accurately than a straight-line distance between the points. Such distances '\n",
" 'are referred to as geodesic distances: In the case of Fig: 3.4, the only way '\n",
" 'to walk from A to B with short point-to-point jumps is to walk along the '\n",
" 'entire elliptical shape of the data distribution while passing C along the '\n",
" 'way: Therefore, A and B are actually the farthest pair of data points (from '\n",
" 'A; B, and C) on this basis! 2.5 Linear Independence 41 to describe the '\n",
" 'location of Kigali because the geographic coordinate sys- tem may be '\n",
" \"considered a two-dimensional vector space (ignoring altitude and the Earth's \"\n",
" 'curved surface) _ The person may add, \"It is about 751km West of here: '\n",
" 'Although this last statement is true, it is not necessary to find Kigali '\n",
" 'given the previous information (see Figure 2.7 for an illus- tration) . '\n",
" 'linearly dependent linearly independent Example 2.13 (Linearly Dependent '\n",
" 'Vectors) A geographic example may help to clarify the concept of linear '\n",
" 'indepen- dence. A person in Nairobi (Kenya) describing where Kiga...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 5 of 8: minkowski distance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this chapter will discuss these broader principles in the context of '\n",
" 'distance function design for data sparsity, distribution, noise, and feature '\n",
" 'relevance. it will focus on the concept of dimensionality and norms that are '\n",
" 'high dimensional because of the varying impact of data stochasticity, '\n",
" 'quality control, point - to - point jumps, or transformations ; such as PCA, '\n",
" 'sVD, or the wavelet transformation for time series (discussed in Chap.3 ). + '\n",
" '* key words : * minkowski metrics with d = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, '\n",
" '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, '\n",
" '30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 51, 52, 53, 54, 55, 60, 65, '\n",
" '70, 75, 80, 85, 90, 95, 100, 110, 115, 120, 130, 140, 150, 170, 200, 220, '\n",
" '240, 250, 300, 350, 400, 500, 600, 700, 800')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The Manhattan distance is the \"city block\" driving distance in a region in '\n",
" 'which the streets are arranged as a rectangular grid, such as the Manhattan '\n",
" 'Island of New York City: A nice property of the Euclidean distance is that '\n",
" 'it is rotation-invariant because the straight-line distance between two data '\n",
" 'points does not change with the orientation of the axis system. This '\n",
" 'property also means that transformations; such as PCA, SVD, or the wavelet '\n",
" 'transformation for time series (discussed in Chap. Therefore_ most of the '\n",
" \"points in the cube lie within a distance range of Dmax Dmin 60 '3d from the \"\n",
" 'origin_ Note that the expected Manhattan distance grows with dimensionality '\n",
" 'at a rate that is linearly proportional to d. Therefore; the ratio of the '\n",
" 'variation in the distances to the absolute values that is referred to as '\n",
" \"Contrast(d) is given by: Dmar Dmin Contrast (d) = '12/d. Therefore, the \"\n",
" 'overall sum of the point-to-point jumps reflects the aggregate change '\n",
" '(distance) from one point to another (distant) point more accurately than a '\n",
" 'straight-line distance between the points. Such distances are referred to as '\n",
" 'geodesic distances: In the case of Fig: 3.4, the only way to walk from A to '\n",
" 'B with short point-to-point jumps is to walk along the entire elliptical '\n",
" 'shape of the data distribution while passing C along the way: Therefore, A '\n",
" 'and B are actually the farthest pair of data points (from A; B, and C) on '\n",
" 'this basis! pj nu]ui ni tnj nin; (u? ui ~ 2p u; + 03 uj ni + nj ninj Ilei = '\n",
" \"1ill? ni + nj Ward's measure is therefore a weighted version of the mean \"\n",
" 'distance measure, since if we use Euclidean distance, the mean distance in '\n",
" '(14.2) can be rewritten as S(pi, Wj) = Ilei ~ kill? MULTIDIMENSIONAL DATA 65 '\n",
" '1.6 3.5 L1 (MANHATTAN) L2 (EUCLIDIAN) L4 L8 1.4 1.2 2.5 1 1 0.8 0.6 1 2 1 5 '\n",
" '0.4 0.5 0.2 100 200 300 400 500 600 700 800 900 DATA DIMENSIONALITY 100 10 '\n",
" '20 30 40 50 60 70 80 DATA DIMENSIONALITY 90 100 Contrasts with '\n",
" 'dimensionality (6) Contrasts with norms Figure 3.1: Reduction in distanc...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 6 of 8: function homogeneity'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the problem of feature selection is intimately related to that of '\n",
" 'determining the inherent clustering tendency of a set of features. '\n",
" 'Typically, a subset of frequent subsequences should be selected, so as to '\n",
" 'maximize coverage and minimize redundancy. there are two primary classes of '\n",
" 'models : 1 Filter models: in this case, a score is associated with each '\n",
" 'feature with the use of similarity-based criterion. 2 Feature selection '\n",
" 'methods determine subsets of subgraphs that maximize the underlying '\n",
" 'clustering tendency ; where external validation criteria, such as labels; '\n",
" 'are not available for model selection. [505]-the bibliographic notes contain '\n",
" 'specific pointers to these methods : 3_ Represent each sequence in the '\n",
" 'database as a \"bag ( marized)subsequences (FSS ) from fs\"that it contains. '\n",
" '4.4 the k-medoids algorithm also uses the notion of representatives, its '\n",
" 'algorithmic structure is different from the generic k -representatives '\n",
" 'algorithm of Fig: 6.2.5 The kullback-Leibler algorithm has three main steps '\n",
" 'corresponding to addition of new clusters, assignment of sequences to '\n",
" 'clusters, and elimination of clusters. 5.8 the longest common subsequence '\n",
" '(LCSS)is a similarity function because higher values indicate greater '\n",
" 'similarity')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Feature selection is generally more difficult for unsupervised problems, '\n",
" 'such as clustering; where external validation criteria, such as labels; are '\n",
" 'not available for feature selection. Intuitively; the problem of feature '\n",
" 'selection is intimately related to that of determining the inherent '\n",
" 'clustering tendency of a set of features. Feature selection methods '\n",
" 'determine subsets of features that maximize the underlying clustering '\n",
" 'tendency. There are two primary classes of models for performing feature '\n",
" 'selection: 1 Filter models: In this case, a score is associated with each '\n",
" 'feature with the use of a similarity-based criterion. A subsequence is '\n",
" 'different from a substring in that the values of the subsequence need not be '\n",
" 'contiguous, whereas the values in the substring need to be contiguous. '\n",
" 'Consider the sequences agb fcgdhei and afbgchdiei. In this case, ei is a '\n",
" 'substring of both sequences and also a subsequence. However, abcde and fgi '\n",
" 'are subsequences of both strings but not substrings. Clearly, subsequences '\n",
" 'of longer length are indicative of a greater level of matching between the '\n",
" 'strings. Unlike the edit distance; the longest common subsequence (LCSS) is '\n",
" 'a similarity function because higher values indicate greater similarity. '\n",
" 'This is true for both definitions of <CUR>-diversity: Lemma 20.3.1 (Entropy '\n",
" '(-diversity monotonicity) If @ table {s entropy <CUR>-diverse; then any '\n",
" 'generalization of the table is entropy <CUR>-diverse as well. Lemma 20.3.2 '\n",
" '(Recursive (c, C)-diversity monotonicity) If a table is recursive (c, 0) - '\n",
" 'diverse, then any generalization of the table is recursive (c, '\n",
" '<CUR>)-diverse as well. The reader is advised to work out Exercises 9(a) and '\n",
" '(b), which are related to these results: Thus; <CUR>-diversity exhibits the '\n",
" 'same monotonicity property exhibited by k-anonymity algo rithms. In the '\n",
" 'former case, higher values indicate greater proximity Some common methods '\n",
" 'for computing the similarity between a pair of sequences are as follows: 1_ '\n",
" 'Simple matching coeffic...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 7 of 8: translation invariance'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the determination of time series and discrete-sequence similarity measures '\n",
" 'is closely related because the latter can be considered the categorical '\n",
" 'version of the former. Euclidean metrics are invariant to axis rotation in '\n",
" 'which each dimension represents a time stamp. it is important to remember '\n",
" 'that these normalization issues may not be relevant to every application. '\n",
" 'Some applications may require only translation, only scaling, or neither ; '\n",
" 'while other applications will require both : in fact; in some cases, the '\n",
" 'wrong choice of3.4.5,the Mahalanobis distance is similar to the traditional '\n",
" 'distance, except that it normalizes the data on the basis of interattribute '\n",
" 'correlations_ for example, if the axis system were to be rotated to its '\n",
" 'principal directions ( i.e. total agreement with the partitioning ) then '\n",
" 'there would have no (second order) inter attribute correlations. 3.9.3 '\n",
" 'general Comments although cluster validation is a widely studied problem in '\n",
" 'the clustering literature ; most methods for cluster rejection are rather '\n",
" 'imperfect. this measure is an asymmetric in terms of true positives and '\n",
" 'negatives, since it ignores the true negatives in other words, it emphasizes '\n",
" 'the similarity between the point pairs that belong together in both the '\n",
" 'ground-truth partitionings, but it discounts the non -')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Cosine Similarity The cosine of the angle between Xi and Xj is given as Xi '\n",
" 'Xj COS = Ilxill IlxjE S d Jaccard Coefficient The Jaccard Coefficient is a '\n",
" 'commonly used similarity mea- sure between two categorical points. This '\n",
" 'metric is a generalization of the Euclidean measure, and stretches the '\n",
" 'distance values along the principal components according to their variance. '\n",
" 'A more sophisticated approach; referred to as ISOMAP, uses nonlinear '\n",
" 'embeddings to account for the impact of nonlinear data distributions Local '\n",
" 'normalization can often provide more effective measures when the '\n",
" 'distribution of the data is heterogeneous. Other data types such as '\n",
" 'categorical data, text, temporal, and graph data present further challenges. '\n",
" 'The determination of time-series and discrete-sequence similarity measures '\n",
" 'is closely related because the latter can be considered the categorical '\n",
" 'version of the former. total agreement with the partitioning T), the Jaccard '\n",
" 'Coefficient has value one, since in that case there are no false positives '\n",
" 'or false negatives. Jaccard Coefficient is asymmetric in terms of the true '\n",
" 'positives and negatives, since it ignores the true negatives In other words, '\n",
" 'it emphasizes the similarity in terms of the point pairs that belong '\n",
" 'together in both the clustering and ground-truth partitioning, but it '\n",
" 'discounts the point pairs that do not belong together. It is important to '\n",
" 'remember that these normalization issues may not be relevant to every '\n",
" 'application. Some applications may require only translation, only scaling, '\n",
" 'or neither of the two. Other applications may require both: In fact; in some '\n",
" 'cases, the wrong choice of 3.4. The Mahalanobis distance is similar to the '\n",
" 'Euclidean distance, except that it normalizes the data on the basis of the '\n",
" 'interattribute correlations_ For example, if the axis system were to be '\n",
" 'rotated to the principal directions of the data (shown in Fig: 3.3) then the '\n",
" 'data would have no (second order) interattribute correlations. In the '\n",
" 'special case where p 2...')\n",
"\n",
"Completed Summary Search - 10.13.2021_00-25\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vs8H1v3eCLcP"
},
"source": [
"Run any additional search-term lists defined in the next cell (skipped when `other_lists` is empty): "
]
},
{
"cell_type": "code",
"metadata": {
"id": "tvaRMyjHCKoN",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"3d810aed1b33438e99487304623a8907",
"bbc0ac56f30548f9b88421e0022142f4",
"54cdfba1ddb64823abf9d670fe8d1781",
"e5901d65bb3445beb1109db43c36bf82",
"eddc90cb8c244d7d97a7ce691641c6f2",
"e1fe58d4071a4f6ea3d7573108e6c039",
"548828504bac43c18cd2a799ca510336",
"9fdb181e7d3e47b99f4deb868fda0e85",
"27fc39e071ed41a1954d0e6c4ebf1e0e",
"1db1e2bcdac04ed28c2aa1885b65f5fa",
"1d1bc375ff9a4032a3b6a9438ba0b7f4"
]
},
"outputId": "3dac1827-1d5b-4aec-e596-0becbb574a08"
},
"source": [
"import time\n",
"\n",
"# Additional search-term lists to run, keyed by a human-readable label.\n",
"# The label is prefixed onto exported results via `add_text`.\n",
"other_lists = {\n",
"    # define any additional term lists here\n",
"    \"terms from pattern recog textbook\": PR_ML,\n",
"    # \"terms from past exams\": past_exams,\n",
"}\n",
"\n",
"# Results retrieved per search term; lower this if the runtime crashes (OOM).\n",
"K_SEARCH = 50\n",
"\n",
"if run_search_term_list and len(other_lists) > 0:\n",
"    # iterate through the additional term lists, if any were defined\n",
"    for key, value in other_lists.items():\n",
"        search_for_terms(\n",
"            sumsearch_pipe,\n",
"            value,\n",
"            k_search=K_SEARCH,\n",
"            add_text=key + \"_\",\n",
"            export_txt=False,\n",
"        )\n",
"        # report which list just finished, with a readable timestamp\n",
"        # (was: raw time.time() epoch float and no list name)\n",
"        print(\"\\n\\n\\n\\n Finished term list '{}' at {} \\n\\n\\n\".format(key, time.ctime()))"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3d810aed1b33438e99487304623a8907",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"getting defs for search_terms...: 0%| | 0/26 [00:00<?, ?it/s]"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"------------------\n",
"\n",
"'search_term 0 of 26: data set'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('a random forest is a classifier consisting of a collection of decision '\n",
" 'trees, where each tree is constructed by applying an algorithm on the '\n",
" 'training set and an additional random vector ; 0, where 0 is sampled i.i.d. '\n",
" 'from some distribution. here we describe one particular option: we generate '\n",
" '0 as follows. 1_ Error rate : let p be the fraction of the instances in a '\n",
" 'set of data points s belonging to the dominant class. then, the error rate '\n",
" 'is Simply 1 ~p. 2_ nondependency- oriented data typically refers to '\n",
" 'multidimensional data. 3_ (Multivariate Discrete Sequence Data) a discrete '\n",
" 'sequence of length n and dimensionality d contains d discrete feature values '\n",
" 'at each of n different time stamps t1 tn for exam- ple, there are 19 '\n",
" 'frequent itemsets contained in tids 2,4, and 5, SO t(bcE) = 245 and '\n",
" 'sUp(BCe)I = 3.2. 4 _ ( Multivariate & itemset mining 241 ) a subset of '\n",
" 'k-item sets may contain less than 50 items compared to a typical supermarket '\n",
" 'database with tens of thousands of items @x;lyj = c}, and let |Dil = ni '\n",
" 'denote the number')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('19.1 Decision Trees decision tree consists of internal nodes that represent '\n",
" 'the decisions corresponding to the hyperplanes Or split-points (i.e , which '\n",
" 'half-space a given point lies in) , and leaf nodes that represent regions Or '\n",
" 'partitions of the data space; which are labeled with the majority class. A '\n",
" 'region is characterized by the subset of data points that lie in that '\n",
" 'region. THE BASIC DATA TYPES Table 1.l: An example of a multidimensional '\n",
" 'data set Name Age Gender Race ZIP code John S_ 45 M African American 05139 '\n",
" 'Manyona L 31 Native American 10598 Sayani A 11 East Indian 10547 \" Jack M. '\n",
" '56 Caucasian 10562 Wei L. 63 Asian 90210 1.3.1 Nondependency-Oriented Data '\n",
" 'This is the simplest form of data and typically refers to multidimensional '\n",
" 'data. This data typically contains a set of records. A record is also '\n",
" 'referred to as a data point, instance, example, transaction, entity, tuple, '\n",
" 'object; or feature-vector; depending on the application at hand. Definition '\n",
" '1.3.3 (Multivariate Discrete Sequence Data) A discrete sequence of length n '\n",
" 'and dimensionality d contains d discrete feature values at each of n '\n",
" 'different time stamps t1 tn Each of the n components Yi contains d discrete '\n",
" 'behavioral attributes (yd yd) collected at the ith time-stamp. A similar '\n",
" 'grid discretization (as designed for the case of frequent trajectory '\n",
" 'patterns) can be used for preprocessing: However; in this case, a somewhat '\n",
" 'different vertical) repre- sentation is used for the locations of the '\n",
" 'different individuals in the grid regions at different times. For each grid '\n",
" 'region and time-interval pair, a list of person identifiers (0r trajectory '\n",
" 'identifiers, is determined. Thus; for the grid region EP and time interval '\n",
" '5, if the persons 3, 9, and 11 are present, then the corresponding set is '\n",
" 'constructed: 16.3. A random forest is a classifier consisting of a '\n",
" 'collection of decision trees, where each tree is constructed by applying an '\n",
" 'algorithm A on the training set S and an additional random vector; 0, where '\n",
" '0 is...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 1 of 26: gaussian distribution'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the kullback-Leibler divergence is a special case of two more general '\n",
" 'classes of divergences called the bregman and f-divergence. just like the '\n",
" 'Euclidean distance is defined as a metric ( section 3.3 ), the variance - to '\n",
" '- variance correlation function can be written in the form of a probability '\n",
" 'distribution that maximizes the entropy for a given variance (or covariance '\n",
" ') any linear transformation of an univariate random variable is again a '\n",
" 'gaussian : the conjugate prior distribution for both the mean p and the '\n",
" 'precision are unknown ; and is also referred to as the normal-Wishart '\n",
" 'distribution which is simply another name for the Gaussian: in this book; we '\n",
" 'use the conventional use of the symbol n to denote this distribution. 2.5 '\n",
" '@xmath0 # 1([#1 ] ) = cmssi10 at 10pt@fileswauxout gsave newpath 20 20 '\n",
" 'moveto fixedpath 2 setlinewidth g save.4 setgraygeometricaltempbox '\n",
" '<EMAIL>#2][#3]#4 [ # 4 ] +')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('2.3_ The Gaussian Distribution The Gaussian; also known as the normal '\n",
" 'distribution; is a widely used model for the distribution of continuous '\n",
" 'variables. In the case of a single variable x, the Gaussian distribution can '\n",
" 'be written in the form 1 N(wlp,o2) = exp (2to2)1/2 (x _ p)? 202 (2.42) where '\n",
" 'p is the mean and 02 is the variance. Normal The normal distribution is '\n",
" 'simply another name for the Gaussian: In this book; we use the term Gaussian '\n",
" 'throughout, although we retain the conventional use of the symbol N to '\n",
" 'denote this distribution. For consistency, we shall refer to the normal- '\n",
" 'gamma distribution as the Gaussian-gamma distribution, and similarly the '\n",
" 'normal- Wishart is called the Gaussian-Wishart. Just like the Euclidean '\n",
" 'distance is a special case of a metric (Section 3.3) , the Kullback-Leibler '\n",
" 'divergence is a special case of two more general classes of divergences '\n",
" 'called Bregman divergences and f-divergences. The study of divergences is '\n",
" 'beyond the scope of this book, and we refer for more details to the recent '\n",
" 'book by Amari (2016), one of the founders of the field of information '\n",
" 'geometry 6.5 Gaussian Distribution The Gaussian distribution is the most '\n",
" 'well-studied probability distribution for continuous-valued random '\n",
" 'variables. It is also referred to as the normal normal distribution '\n",
" 'distribution. It comprises the product of a Gaussian distribution for p, '\n",
" 'whose precision is proportional to A, and a gamma distribution over A p(p, '\n",
" 'Alpo, B,a,b) = N (plpo, (BA)-1) Gam(Aa,b)_ B.52) Gaussian-Wishart This is '\n",
" 'the conjugate prior distribution for a multivariate Gaussian N (xlu, 4) in '\n",
" 'which both the mean p and the precision are unknown; and is also called the '\n",
" 'normal-Wishart distribution: It comprises the product of a Gaussian '\n",
" 'distribution for p, whose precision is proportional to A and a Wishart '\n",
" 'distribution over ^ p(p, Alpo, 8, W,v) = N (ulpo, (B1)-1) W(AIW,v). The '\n",
" 'Gaussian is the distribution that maximizes the entropy for a given variance '\n",
" '(or covariance) Any linear tra...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 2 of 26: log likelihood function'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the maximum likelihood principle was studied by Ronald Fisher in the '\n",
" 'beginning of the 20th century. it has been shown that for most smooth '\n",
" 'functions, when the number of data points goes to infinity ; the estimate '\n",
" 'asymptotically converges to the true density value, provided that the width '\n",
" 'h is chosen appropriately. there are many excellent books on the generative '\n",
" 'and bayesian approaches to machine learning : see, for example, (Bishop '\n",
" '2006, Koller & Friedman 2009, MacKay 2003, Murphy 2012, Barber 2012 ). we '\n",
" 'have also described several specific algorithms for implementing the maxim '\n",
" 'likelihood under dif- ferent assumptions on an underlying data distribution, '\n",
" 'in particular, Naive Bayes, LDA, and EM: 24.7 bibliographic Remarks [ '\n",
" 'section ] [ thm]probabilistic modeling, inference, statistical mechanics, '\n",
" 'computational biology, computer science + * ams subject classification : * '\n",
" '90c25, 92d05, 60j10, 62g20, 65k35, 82e30 + keywords : probabilistics, '\n",
" 'general statistics, probability theory, model selection, correlation matrix, '\n",
" 'gaussian random variables, covariance, correlations, normality, conditional '\n",
" 'distributions, linear transformation, convolutional priors,')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('We have also described several specific algorithms for implementing the '\n",
" 'maximum likelihood under dif- ferent assumptions on the underlying data '\n",
" 'distribution, in particular, Naive Bayes, LDA, and EM: 24.7 Bibliographic '\n",
" 'Remarks The maximum likelihood principle was studied by Ronald Fisher in the '\n",
" \"beginning of the 20th century: Bayesian statistics follow Bayes' rule, which \"\n",
" 'is named after the 18th century English mathematician Thomas Bayes. There '\n",
" 'are many excellent books on the generative and Bayesian approaches to '\n",
" 'machine learning: See, for example, (Bishop 2006, Koller & Friedman 2009, '\n",
" 'MacKay 2003, Murphy 2012, Barber 2012) . It has been shown that for most '\n",
" 'smooth functions Kh(:), when the number of data points goes to infinity; the '\n",
" 'estimate asymptotically converges to the true density value, provided that '\n",
" 'the width h is chosen appropriately. The density at each data point is '\n",
" 'computed with- out including the point itself in the density computation: '\n",
" 'The value of the density is reported as the outlier score. Low values of the '\n",
" 'density indicate greater tendency to be an outlier. Density-based methods '\n",
" 'have similar challenges as histogram- and grid-based techniques. For '\n",
" 'example; consider the problem of estimating the mean of a Gaussian variable '\n",
" 'of unit variance. We saw previously that the maximum likelihood estimator is '\n",
" 'the average: j = m Zi Bi. Let p* be the optimal parameter. Then; PuEx] E '\n",
" '[e(,x) = e(p* , .)] = E log c~N(u*,1) c~N(u*,1) Pe[x] E (or-p\")? + {(r ~ '\n",
" 'p)?) x~N(p*,1) 2 62+( -p) E [x] 2 x~N(p*,1) 2 62+(\" - pv\" 2 1 2( = p*)2. '\n",
" 'Table 6.2 lists examples for conjugate priors for the parameters of some '\n",
" 'standard likelihoods used in probabilistic modeling: Distributions such as '\n",
" 'The Gamma prior is Multinomial, inverse Gamma; inverse Wishart; and '\n",
" 'Dirichlet can be found conjugate for the precision (inverse in any '\n",
" 'statistical text; and are described in Bishop (2006), for example variance) '\n",
" 'in the The Beta distribution is the conjugate prior for the parameter /l in '\n",
" 'both univaria...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 3 of 26: linear regression model'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('maximum likelihood estimation has a rich history and was originally proposed '\n",
" 'by Sir Ronald Fisher in the 1930s. an alternative view of this is to '\n",
" 'consider likelihoods that are from the ex- ponential family. the class of '\n",
" 'models, which have linear dependence between parameters and data, and have '\n",
" 'potentially nonlin- ear transformation 4 (called a link function ), is '\n",
" 'referred to as generalized linear models (Agresti, 2002, chapter 4)_ we will '\n",
" 'expand upon the idea of a probabilistic model in section 8.4.6 where we '\n",
" 'introduce the concept of multiple outcome shrinkage and selection methods '\n",
" 'for regression. + * key words : * linear regression, least squares fit, '\n",
" 'lasso, least angle regression 3.3.2 - 2.7 - 1.5 - 0.25 -0.30 -1.50 -10pt '\n",
" '-20pt # 1#2#3#4#1 * # 2 *, # 3 ( # 4 ) @xmath0 <CUR> ] = cmbx10 at 10:55 am '\n",
" 'msc2010 subject classification : primary 60k35 ; secondary 60g05, 60j60, '\n",
" '62e15, 82b40 keywords : inference, machine learning, randomness, error '\n",
" 'correction,')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('(3.20) 52 3 Linear Methods for Regression The first term is the variance, '\n",
" 'while the second term is the squared bias. The Gauss-Markov theorem implies '\n",
" 'that the least squares estimator has the smallest mean squared error of all '\n",
" 'linear estimators with no bias. However there may well exist a biased '\n",
" 'estimator with smaller mean squared error_ Such an estimator would trade a '\n",
" 'little bias for a larger reduction in variance_ Biased estimates are '\n",
" 'commonly used. 84 3 Linear Methods for Regression 3.7 Multiple Outcome '\n",
" 'Shrinkage and Selection As noted in Section 3.2.4, the least squares '\n",
" 'estimates in a multiple-output linear model are simply the individual least '\n",
" 'squares estimates for each of the outputs. To apply selection and shrinkage '\n",
" 'methods in the multiple output case one could apply a univariate technique '\n",
" 'individually to each outcome Or si- multaneously to all outcomes. An '\n",
" 'alternative view of this is to consider likelihoods that are from the ex- '\n",
" 'ponential family (Section 6.6)_ The class of models, which have linear '\n",
" 'dependence between parameters and data, and have potentially nonlin- ear '\n",
" 'transformation 4 (called a link function) , is referred to as generalized '\n",
" 'linear models (Agresti, 2002, chapter 4). Maximum likelihood estimation has '\n",
" 'a rich history and was originally proposed by Sir Ronald Fisher in the '\n",
" '1930s. We will expand upon the idea of a probabilistic model in Section 8.4. '\n",
" 'Computation of the lasso via the LAR algorithm has the same order of '\n",
" 'computation as a least squares fit. 94 3 Linear Methods for Regression '\n",
" 'Bibliographic Notes Linear regression is discussed in many statistics books, '\n",
" 'for example, Seber (1984) , Weisberg (1980) and Mardia et al. (1979). Ridge '\n",
" 'regression was introduced by Hoerl and Kennard (1970), while the lasso was '\n",
" 'proposed by Tibshirani (1996) Around the same time, lasso-type penalties '\n",
" 'were pro- posed in the basis pursuit method for signal processing (Chen et '\n",
" 'al. 1998) The least angle regression procedure was proposed in Efron et al. '\n",
" '3.4.4 Least Angl...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 4 of 26: graphical models'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('complex computations, required to perform inference and learning in sophis- '\n",
" 'ticated models, can be expressed in terms of graphical manipulations, in '\n",
" 'which underlying mathematical expressions are carried along implicitly : '\n",
" 'graph comprises nodes (also called vertices ) connected by links ( also '\n",
" 'known as edges or arcs): each node represents a random variable ( o1 group '\n",
" 'of random variables ), and the links express probabilistic relation- ships '\n",
" 'between these variables_ the graph then captures the process by which the '\n",
" 'joint distribution over all of the data is decomposed into a product of '\n",
" 'factors each depending oly on a subset of those variables. 14.4 tree-based '\n",
" 'models there are various simple, but widely used, models that work by '\n",
" 'partitioning the input space into cuboid regions, whose edges are aligned '\n",
" 'with the axes, and then assigning a simple model (for example, a constant ) '\n",
" 'to each region: here we focus on one particular tree - based framework '\n",
" 'called classification and regression trees, or cART (Breiman et al., 1984 ; '\n",
" 'quinlan, 1986; Quinlan; 1993). 13.8 graphical models representing the '\n",
" 'creation of images of objects have independent prior probabilities : the '\n",
" 'image (a vector of pixel intensities ) has a probability distribution that '\n",
" 'is dependent on the identity')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('In this case, the values of the pixels represent the behavioral attributes, '\n",
" 'and the spatial locations of these pixels represent the contextual '\n",
" 'attributes. The behavioral attributes in spatial data may present themselves '\n",
" 'in a variety of ways, depending on the application domain: 1. For some types '\n",
" 'of spatial data, such as images, the analysis may be performed on the '\n",
" 'contour of a specific shape extracted from the data_ For example, in Fig: '\n",
" '16.3, the contour of the insect may be extracted and analyzed with respect '\n",
" 'to other images in the data. GRAPHICAL MODELS Figure 8.8 A graphical model '\n",
" 'representing the process by which Object images of objects are created, in '\n",
" 'which the identity of an object (a discrete variable) and the position and '\n",
" 'orientation of that object (continuous variables) have independent prior '\n",
" 'probabilities: The image (a vector of pixel intensities) has a probability '\n",
" 'distribution that is dependent on the identity of the object as well as on '\n",
" 'its position and orientation. 14.4 Tree-based Models There are various '\n",
" 'simple, but widely used, models that work by partitioning the input space '\n",
" 'into cuboid regions, whose edges are aligned with the axes, and then '\n",
" 'assigning a simple model (for example, a constant) to each region: They can '\n",
" 'be viewed as a model combination method in which only one model is '\n",
" 'responsible for making predictions at ay given point in input space. Complex '\n",
" 'computations, required to perform inference and learning in sophis- ticated '\n",
" 'models, can be expressed in terms of graphical manipulations, in which '\n",
" 'underlying mathematical expressions are carried along implicitly: graph '\n",
" 'comprises nodes (also called vertices) connected by links (also known as '\n",
" 'edges or arcs): In a probabilistic graphical model, each node represents a '\n",
" 'random variable (O1 group of random variables), and the links express '\n",
" 'probabilistic relation- ships between these variables_ The graph then '\n",
" 'captures the way in which the joint distribution over all of the random '\n",
" 'variables can be decomp...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 5 of 26: latent variables'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('supervised event detection is one in which the class labels are associated '\n",
" 'with the timestamps rather than the full series. while it is possible in '\n",
" 'principle to define the problem with a balanced distribution of labels, this '\n",
" 'is rarely the case in application-specific settings. therefore, the '\n",
" 'discussion in this subsection will focus only on the imbalanced label '\n",
" 'distribution scenario. two kinds of change points almost always begin with '\n",
" 'an individual outlier based on a specific time window : 1. changes in the '\n",
" 'aggregate trends of the multidimensional data for example, an unusual event '\n",
" 'such as a terrorist attack may lead to a burst of news stories on that '\n",
" 'specific topic: 2. these aggregated outliers are referred to as collective '\n",
" 'outliers because they are defined by combining the patterns from multiple '\n",
" 'data items. 14.7.1 Supervised Event Detection and 13.2.5 The Viterbi '\n",
" 'algorithm in many applications of hidden Markov models, the latent variables '\n",
" 'have some meaningful interpretation, and s0 it turns out to be often of '\n",
" 'interest to find the most probable sequence of continuous streams of time '\n",
" 'series data objects for a given observation sequence. when more than one '\n",
" 'behavioral attribute is identified with each series ; the corresponding '\n",
" 'series can be characterized as either a nonstationary or a noisy series')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('14.7.1 Supervised Event Detection The problem of supervised event detection '\n",
" 'is one in which the class labels are associated with the timestamps rather '\n",
" 'than the full series. In most cases, one or more of the class labels are '\n",
" 'rare; and the remaining labels correspond to the \"normal\" periods. While it '\n",
" 'is possible in principle to define the problem with a balanced distribution '\n",
" 'of labels, this is rarely the case in application-specific settings. '\n",
" 'Therefore, the discussion in this subsection will focus only on the '\n",
" 'imbalanced label distribution scenario. These rare class labels correspond '\n",
" 'to the events in the underlying data. Such an outlier is also referred to as '\n",
" 'a novelty: 2 . The second is based on changes in the aggregate trends of the '\n",
" 'multidimensional data For example, an unusual event such as a terrorist '\n",
" 'attack may lead to a burst of news stories on a specific topic: This '\n",
" 'represents an aggregated outlier based on a specific time window. The second '\n",
" 'kind of change point almost always begins with an individual outlier of the '\n",
" 'first type. These should be selected in the training phase, SO as to '\n",
" 'maximize the differences in the alarm level between the primary events and '\n",
" 'the normal periods. To learn the coefficients W1 ad in the training phase; '\n",
" 'the composite alarm level is aver- aged at the timestamps Ti T for all '\n",
" 'primary events of interest. Note that the composite alarm level at each '\n",
" 'timestamp Ti is an algebraic expression; which is a linear function of the '\n",
" 'coefficients Q1 @d according to Eq: 14.24. Real-time data is also collected '\n",
" 'from patients in inten- sive care units (ICU) to monitor their condition: '\n",
" 'Financial market data: Financial data, such as stock prices; is often '\n",
" 'temporal. Other forms of temporal data include commodity prices; industrial '\n",
" 'trends, and economic indicators. In general, temporal data may be either '\n",
" 'discrete or continuous. For example, Web log data contains a series of '\n",
" 'discrete events corresponding to user clicks, whereas environmental data may '\n",
" 'contain a ...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 6 of 26: error function'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('ensemble analysis can be best understood by examining the different '\n",
" 'components of the error of a classifier as discussed in statistical learning '\n",
" 'theory. there are three primary components to this error : 1 ) bias ; 2 ) '\n",
" 'overfitting ; 3 ) regularization, which is a way to compromise between '\n",
" 'accurate solution of empirical risk minimization and the size or complexity '\n",
" 'of more complex models with lots of parameters. Overfitting typically occurs '\n",
" 'if the underly- ing model (or its parametrization) is overly flexible and '\n",
" 'expressive; see Section 8.6.4 that the term % Ilwll? arises directly from '\n",
" 'the optimization problem. here, we will need to look at different '\n",
" 'loss/objective functions (they determine what a \"good\" fit is) and '\n",
" 'optimization algorithms that allow us to minimize this loss. Examples '\n",
" 'include the following: the degree of polynomial in a regression setting, '\n",
" 'number of components in mixture model, network architecture of an (deep) '\n",
" 'neural network, type of kernel in support vector machine, dimensionality of '\n",
" 'latent space in PCA The learning rate (schedule) in an optimization '\n",
" 'algorithm In parametric models, the number Rasmussen and Ghahramani (2001) '\n",
" \"showed that it is automatic Occam'<CUR>s of parameter is razor does not \"\n",
" 'necessarily')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('We define L to be the number of mistakes made by SOA and we define {i1,12, '\n",
" 'iL} to be the set of rounds in which SOA made the mistakes. Now, consider '\n",
" 'the Expert (i1,12= iL) running o the sequence X1 X2, XT By construction; the '\n",
" 'set Vt maintained by Expert(i1, 12, iL) equals the set Vt maintained by SOA '\n",
" 'when running on the sequence (X1, h(x1)), (XT, h(xT)) . The predictions of '\n",
" 'SOA differ from the predictions of h if and only if the round is in {11,62, '\n",
" 'iL}. The regularization parameter trades regularization off minimizing the '\n",
" 'loss on the training set and the magnitude of the pa- parameter rameters 0_ '\n",
" 'It often happens that the magnitude of the parameter values becomes '\n",
" 'relatively large if we run into overfitting (Bishop, 2006). The '\n",
" 'regularization term is sometimes called the penalty term, which bi- penalty '\n",
" 'term ases the vector 0 to be closer to the origin: The idea of '\n",
" 'regularization also appears in probabilistic models as the prior probability '\n",
" 'of the parameters. The rationale for ensemble analysis can be best '\n",
" 'understood by examining the different components of the error of a '\n",
" 'classifier, as discussed in statistical learning theory. There are three '\n",
" 'primary components to the error of a classifier: 1 Bias: Every classifier '\n",
" 'makes its own modeling assumptions about the nature of the decision boundary '\n",
" 'between classes. For example, a linear SVM classifier assumes that the two '\n",
" 'classes may be separated by a linear decision boundary. This is, of course, '\n",
" 'not true in practice. The Akaike information Akaike information criterion '\n",
" '(AIC) (Akaike, 1974) criterion log p(a | 0) = M (8.48) corrects for the bias '\n",
" 'of the maximum likelihood estimator by addition of a penalty term to '\n",
" 'compensate for the overfitting of more complex models with lots of '\n",
" 'parameters. Therefore, we need to somehow bias the search for the minimizer '\n",
" 'of empirical risk by introducing a penalty term, which makes it harder for '\n",
" 'the optimizer to return an overly flexible predictor: In machine learning, '\n",
" 'regularization the penalty...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 7 of 26: training data'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('classification is sometimes referred to as supervised learning because an '\n",
" 'example data set is used to learn the structure of the groups, just as a '\n",
" 'teacher supervises his or her students towards a specific goal. online '\n",
" 'training allows the network to handle very large training sets, and also to '\n",
" 'update the weights as new observations come in. Example 7.4 consider the '\n",
" 'classification prediction algorithm Memorize defined as follows. given a '\n",
" 'test point i, it predicts the majority label among all labeled instances of '\n",
" '& that exist in the training sample (and some fixed default label if no '\n",
" 'instance of x appears in this training set ). it is possible to show ( see '\n",
" 'Exercise 6) that the memorize algorithm is universally consistent for every '\n",
" 'countable domain X and a finite label set v (w.r.t. the zero-one loss). 2.9 '\n",
" 'consider a linear regression model with p parameters, fit by least squares '\n",
" 'to a set of training data (11,41), (en @xip the inputs for the ith training '\n",
" 'case, let Yi be a response measurement_ the predictions are based on the '\n",
" 'bootstrapped sample containing previously solved cases, where the joint '\n",
" 'values of all variables are known_ This is usually characterized by some '\n",
" 'loss function l(y, 9) =')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('The fraction of the labeled data points included at least once in the '\n",
" 'training data is therefore 1 _ 1/e ~ 0.632. The training model M is '\n",
" 'constructed on the bootstrapped sample containing duplicates. The overall '\n",
" 'accuracy is computed using the original set of full labeled data as the test '\n",
" 'examples. The estimate is highly optimistic of the true classifier accuracy '\n",
" 'because of the large overlap between training and test examples. The '\n",
" 'previously unseen data points that need to be classified are collectively '\n",
" 'referred to as the test data set The algorithm that creates the training '\n",
" 'model for prediction is also sometimes referred to as the learner: '\n",
" 'Classification is, therefore, referred to as supervised learning because an '\n",
" 'example data set is used to learn the structure of the groups, just as a '\n",
" 'teacher supervises his or her students towards a specific goal. Example 7.4 '\n",
" 'Consider the classification prediction algorithm Memorize defined as '\n",
" 'follows. The algorithm memorizes the training examples, and, given a test '\n",
" 'point I , it predicts the majority label among all labeled instances of & '\n",
" 'that exist in the training sample (and some fixed default label if no '\n",
" 'instance of x appears in the training set). It is possible to show (see '\n",
" 'Exercise 6) that the Memorize algorithm is universally consistent for every '\n",
" 'countable domain X and a finite label set V (w.r.t. the zero-one loss) . The '\n",
" 'right panel shows the reference data (blue) generated uniformly over the '\n",
" 'rectangle containing the training data. The training sample was labeled as '\n",
" 'class 1, and the reference sample class 0, and a logistic regression model, '\n",
" 'using a tensor product of natural splines (Section 5.2.1) , was fit to the '\n",
" 'data. two-thirds or three- fourths) is used as the training data, and the '\n",
" 'remaining is used as the test data. The approach can be repeated several '\n",
" 'times with multiple samples to provide a final estimate The problem with '\n",
" 'this approach is that classes that are overrepresented in the training data '\n",
" 'are also underrepres...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 8 of 26: conditional mixture models'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('in this chapter, we discuss mixtures over continuous vari- ables described '\n",
" 'by unconditional density models such as gaussians and replace the component '\n",
" 'densities with conditional distributions. a further generalization to allow '\n",
" 'the mixing coeffi- cients also to depend on the inputs then we obtain a '\n",
" 'hierarchical mixture of experts model that is based on categorical data : '\n",
" 'each component of the mixture needs to reflect a set of discrete attributes '\n",
" 'rather than numerical attributes. In other words, a generative process uses '\n",
" 'the following two steps to generate each point in the d-dimensional data set '\n",
" ': 1. select a mixture component with prior probability Qi; where i <CUR> {1 '\n",
" 'k}. 2. report outliers that do not fit this mixture model are reported as '\n",
" 'outlier data points. 9.2.3 Mixtures f Bernoulli distributions ; 667 logistic '\n",
" 'regression models ( section 14.5.1 ) here we illustrate the em algorithm in '\n",
" 'a different context ; we now discuss mix- tures of discretized binary '\n",
" 'variables describing by multinomial or even a gaussian model for numeric '\n",
" 'data. 10.4 Rule-Based classifiers ; probabilistic approach to data '\n",
" 'clustering ; bayes model ; latent class analysis 11.8 rule-based '\n",
" 'nonparametric')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('SAMPLING METHODS Figure 11.12 The Gibbs sampling method requires samples to '\n",
" 'be drawn from the conditional distribution of a variable condi- tioned on '\n",
" 'the remaining variables: For graphical models, this conditional distribution '\n",
" 'is a function only of the states of the nodes in the Markov blanket. 9.3.3 '\n",
" 'Mixtures f Bernoulli distributions So far in this chapter; we have focussed '\n",
" 'on distributions over continuous vari- ables described by mixtures of '\n",
" 'Gaussians. As a further example of mixture mod- elling, and to illustrate '\n",
" 'the EM algorithm in a different context; we now discuss mix- tures of '\n",
" 'discrete binary variables described by Bernoulli distributions. This model '\n",
" 'is also known as latent class analysis (Lazarsfeld and Henry, 1968; '\n",
" 'McLachlan and Peel, 2000). Therefore, each component of the mixture needs to '\n",
" 'reflect a set of discrete attributes rather than numerical attributes. In '\n",
" 'other words, a generative mixture model of categorical data needs to be '\n",
" 'designed. Data points that do not fit this mixture model are reported as '\n",
" 'outliers. The k components of the mixture model are denoted by G1 Gk _ The '\n",
" 'generative process uses the following two steps to generate each point in '\n",
" 'the d-dimensional data set D: 1. Select a mixture component with prior '\n",
" 'probability Qi; where i <CUR> {1 k}. An alternative way to motivate the '\n",
" 'hierarchical mixture of experts model is to start with a standard '\n",
" 'probabilistic mixtures of unconditional density models such as Gaussians and '\n",
" 'replace the component densities with conditional distributions. Here we '\n",
" 'consider mixtures of linear regression models (Section 14.5.1) and mixtures '\n",
" 'of Chapter 9 14.5. Conditional Mixture Models 667 logistic regression models '\n",
" '(Section 14.5.2) In the simplest case, the mixing coeffi- cients are '\n",
" 'independent of the input variables If we make a further generalization to '\n",
" 'allow the mixing coefficients also to depend on the inputs then we obtain a '\n",
" 'mixture of experts model. In general, the Bayes model can assume any '\n",
" 'parametric form of th...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 9 of 26: mixture density network'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('community detection algorithms have built-in mechanisms to ensure that the '\n",
" 'underlying clusters are balanced. in small-world networks, the distances '\n",
" 'between different pairs of nodes is a small number that can not provide a '\n",
" 'sufficiently fine-grained indicator of similarity : rather, it is more '\n",
" 'important to use triadic closure properties of real networks, explicitly 0 '\n",
" 'implicitly, in the clustering process. any pair of clusters whose density '\n",
" 'attractors are connected to each other by a path of density at least t will '\n",
" 'be merged. this step addresses the merging of multiple density peaks, as '\n",
" 'illustrated in Fig: 6.17.18, and is analogous to the postprocessing step '\n",
" 'used in grid-based methods and dBSCAN The overall DENCLUE algorithm is also '\n",
" 'an example of such hub nodes connecting up different communities. while '\n",
" 'social networks usually have distinct community structures, these '\n",
" 'high-degree hub - nodes connect different types of communities, thereby '\n",
" 'bringing them together. 4.2.5 2.3.1 3.0.4 1.7 * keywords : * mixture of '\n",
" 'experts model ; mixtures of data points ; clustering methods ; '\n",
" 'graph-partitioning ; network analysis + * ams subject classifications ( 2010 '\n",
" ') :* primary 60j05 ; secondary 62g20 ;')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Intuitively, fjk(i) is a fraction that indicates the level of control that '\n",
" 'node i has over nodes j and k in terms of regulating the flow of information '\n",
" 'between them Then, the betweenness centrality CB(i) is the average value of '\n",
" 'this fraction over all 2 pairs of nodes. Ci<k fik(i) CB(i) = (19.13) The '\n",
" 'betweenness centrality also lies between 0 and 1, with higher values '\n",
" 'indicating better betweenness. Unlike closeness centrality; betweenness '\n",
" 'centrality can be defined for discon- nected networks as well. Typically, '\n",
" 'social networks are formed by preferential attachment, and they exhibit '\n",
" 'power-law degree distributions: The problem of clustering social networks is '\n",
" 'challenging because of the presence of hub nodes, and the natural tendency '\n",
" 'of social networks to cluster into a single large group Therefore, most '\n",
" 'community detection algorithms have built-in mechanisms to ensure that the '\n",
" 'underlying clusters are balanced. Clustering methods are also sometimes '\n",
" 'referred to as graph-partitioning: One of the earliest clustering methods '\n",
" 'was the Kernighan-Lin method, which uses an iterative approach for '\n",
" 'clustering: Nodes are repeatedly exchanged between partitions to iteratively '\n",
" 'improve the value of the objective function. Although these simple mixtures '\n",
" 'extend the flexibility of linear models to include more com- plex (e-g-, '\n",
" 'multimodal) predictive distributions, they are still very limited: We can '\n",
" 'further increase the capability of such models by allowing the mixing '\n",
" 'coefficients themselves to be functions of the input variable, so that K Tk '\n",
" '(x)pr(tlx). k=l p(tlx) (14.53) This is known as a mixture of experts model '\n",
" '(Jacobs et al, 1991) in which the mix- ing coefficients Tk (x) are known as '\n",
" 'gating functions and the individual component densities pk (tlx) are called '\n",
" 'experts. NEURAL NETWORKS p(tlx) _ I D 11 Figure 5.20 The mixture density '\n",
" 'network can represent general conditional probability densities p(tlx) by '\n",
" 'considering a parametric mixture model for the distribution Of t whose '\n",
" 'parameter...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 10 of 26: log marginal likelihood'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the maximum likelihood principle was studied by Ronald Fisher in the '\n",
" \"beginning of the 20th century : bayesian statistics follow Bayes' rule. it \"\n",
" 'has been shown that for most smooth functions when the number of data points '\n",
" 'goes to infinity ; the estimate asymptotically converges to the true density '\n",
" 'value, provided that the width h is chosen appropriately. we have also '\n",
" 'described several specific algorithms for implementing the minimum '\n",
" 'likelihood under dif- ferent assumptions on the underlying data '\n",
" 'distribution, in particular, @xmath0density-based methods have similar '\n",
" 'challenges as histogram- and grid- based techniques ( e.g, numerical '\n",
" 'integration ), stochastic approximations using Monte Carlo (Murphy, 2012), '\n",
" 'or a combination thereof (OHagan, 1991; Rasmussen and Ghahramani; 2003 ). + '\n",
" '* keywords : * machine learning, model selection, marginal likelihood, '\n",
" 'outlier score, gaussian variable of unit variance, linear regression, random '\n",
" 'variables, differential equations, generalization, uniform prior, '\n",
" 'conditional probability, regularized risk minimization principle, baryon - '\n",
" 'leibler \"distance \" between and p(ax + by) = n (apz + bpy\\' aP2_+ bp2')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Remark (Computing the Marginal Likelihood) The marginal likelihood plays an '\n",
" 'important role in model selection: We need to compute Bayes factors (8.46) '\n",
" 'and posterior distributions over models (8.43). Unfortunately, computing the '\n",
" 'marginal likelihood requires us to solve an integral (8.44). This '\n",
" 'integration is generally analytically intractable, and we will have to '\n",
" 'resort to approximation techniques, e.g , numerical integration (Stoer and '\n",
" 'Burlirsch, 2002) , stochastic approximations using Monte Carlo (Murphy, '\n",
" '2012) , or Bayesian Monte Carlo techniques (OHagan, 1991; Rasmussen and '\n",
" 'Ghahramani; 2003). For example; consider the problem of estimating the mean '\n",
" 'of a Gaussian variable of unit variance. We saw previously that the maximum '\n",
" 'likelihood estimator is the average: j = m Zi Bi. Let p* be the optimal '\n",
" 'parameter. Then; PuEx] E [e(,x) = e(p* , .)] = E log c~N(u*,1) c~N(u*,1) '\n",
" 'Pe[x] E (or-p\")? + {(r ~ p)?) x~N(p*,1) 2 62+( -p) E [x] 2 x~N(p*,1) 2 62+(\" '\n",
" '- pv\" 2 1 2( = p*)2. We have also described several specific algorithms for '\n",
" 'implementing the maximum likelihood under dif- ferent assumptions on the '\n",
" 'underlying data distribution, in particular, Naive Bayes, LDA, and EM: 24.7 '\n",
" 'Bibliographic Remarks The maximum likelihood principle was studied by Ronald '\n",
" 'Fisher in the beginning of the 20th century: Bayesian statistics follow '\n",
" \"Bayes' rule, which is named after the 18th century English mathematician \"\n",
" 'Thomas Bayes. There are many excellent books on the generative and Bayesian '\n",
" 'approaches to machine learning: See, for example, (Bishop 2006, Koller & '\n",
" 'Friedman 2009, MacKay 2003, Murphy 2012, Barber 2012) . It has been shown '\n",
" 'that for most smooth functions Kh(:), when the number of data points goes to '\n",
" 'infinity; the estimate asymptotically converges to the true density value, '\n",
" 'provided that the width h is chosen appropriately. The density at each data '\n",
" 'point is computed with- out including the point itself in the density '\n",
" 'computation: The value of the density is reported as the outlier score. '\n",
" 'Lo...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 11 of 26: posterior probability'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the posterior distribution is the quantity of interest in bayesian '\n",
" 'statistics posterior because it expresses exactly what we are interested in '\n",
" \"& after having observed y. if we observe %; we can use Bayes' theorem to \"\n",
" 'draw some conclusions about x given the observed values of y. we call p(y x) '\n",
" 'either the \"likelihood of \"measurement 3 ( given % ) or the (( \\'probability '\n",
" 'of @x given w\" but never the likelihood of model\". y (MacKay, 2003). + * '\n",
" 'keywords : * posterior, marginal likelihood, bayes law, probabilistic '\n",
" 'inference, conditional distributions, noninformative priors, gaussian and '\n",
" 'frequentist paradigms, markov - faisal learning, penalization property, '\n",
" 'probability measure, correlation function, random variables, planck s theory '\n",
" 'of general relativity, causality, differential equations, bardeen - '\n",
" 'robertson - walker equation, monte carlo method, product rule, adiabatic '\n",
" 'formulae, asymptotic consistency, quantification of uncertainty, linear '\n",
" 'transformation of a multivariate random variable, regression analysis, '\n",
" 'statistical mechanics, computer science, neural networks, computational '\n",
" 'biology, data mining, network engineering, information processing, machine '\n",
" 'learning.')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('We call p(y x) either the \"likelihood of \"measurement 3 (given %) or the (( '\n",
" '\\'probability of y given w\" but never the likelihood of model\". y (MacKay, '\n",
" '2003) . The posterior p(a y) is the quantity of interest in Bayesian '\n",
" 'statistics posterior because it expresses exactly what we are interested in, '\n",
" 'i.e., what we know about & after having observed y. 02021 M P Deisenroth, A_ '\n",
" 'A Faisal, C. S. Ong: . 186 Probability and Distributions The quantity p(y) : '\n",
" 'ply | a)p(. )da = Ex[p(y | w)] (6.27) marginal likelihood evidence is the '\n",
" 'marginal likelihood/evidence. The Gaussian is the distribution that '\n",
" 'maximizes the entropy for a given variance (or covariance) Any linear '\n",
" 'transformation of a Gaussian random variable is again Gaussian. The marginal '\n",
" 'distribution of a multivariate Gaussian with respect to a subset of the '\n",
" 'variables is itself Gaussian, and similarly the conditional distribution is '\n",
" 'also Gaussian: The conjugate prior for p is the Gaussian, the conjugate '\n",
" 'prior for A is the Wishart, and the conjugate prior for (1,4) is the '\n",
" \"Gaussian-Wishart. If we observe %; we can use Bayes' theorem to draw some \"\n",
" \"conclusions about x given the observed values of y. Bayes' theorem (also \"\n",
" \"Bayes' theorem Bayes' rule or Bayes' law) Bayes' rule likelihood prior \"\n",
" \"Bayes' law pky [w)plo p(a |y), (6.23) posterior evidence is a direct \"\n",
" 'consequence of the product rule in (6.22) since p(x, y) = p(x |y)pky) (6.24) '\n",
" 'and p(c,y) = ply | x)p(w) (6.25) so that P(y | w)p(e) p(x |y) = p(y) p(x '\n",
" '|y)p(y) 3 p(y | w)p(a) (6.26) In (6.23), P(e) is the prior, which '\n",
" 'encapsulates our subjective prior prior knowledge of the unobserved (latent) '\n",
" 'variable x before observing any data. (1.66) We can now determine w by '\n",
" 'finding the most probable value of w given the data, in other words by '\n",
" 'maximizing the posterior distribution: This technique is called maximum '\n",
" 'posterior, or simply MAP. The Bayesian view also includes the specification '\n",
" 'of the prior; which includes design choices such as conjugacy (Section '\n",
" '6.6.1) with the likelihood. Ad-...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 12 of 26: maximum'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this book shall be available for purchase from Cambridge University Press '\n",
" 'and other standard distribution channels. no unauthorized distribution may '\n",
" 'be allowed, and that the reader may make one copy only for personal '\n",
" 'on-screen use. Please do not distribute. + * key words : * black oreo dory, '\n",
" 'research trawls, statistical estimation framework, random vari- able ( rvio '\n",
" ') package gbn in package _ 2.2.0_version 1.5-7 of package gnu - bnx in a '\n",
" 'language suitable for computer aided design ( caddis ) or graphics '\n",
" 'processing ( grapheme ).**pacs number(s ) : 02.50.+q, 05.40.-a, '\n",
" '89.65.gh,89.75.hc*keywords:**research trawl, restricted access fishery, data '\n",
" 'mining, non - invasive species, mathematical modeling, predictive modelling, '\n",
" 'computational biology, randomized control group ( prbg ), monte carlo '\n",
" 'simulations, probability theory, hypothesis testing, stochastic differential '\n",
" 'equations, markov chain renormalization tomography ( mct ) model, time '\n",
" 'series analysis, test point x ( 6.76,4.25)@xmath0 # 1#1')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('For obtaining the maximum and an enumerator you w Would have a p d d is '\n",
" 'given in which is one over and your a greater than sixty greater or equal '\n",
" 'than sixty and then you would have multiplieduper a p d d which is a place '\n",
" 'doctor. In which is one over an maxider for it does not count for the '\n",
" 'calculation of the maximum and then you just let you just see that this '\n",
" 'probability decreases as when an increase and therefore the maximum the '\n",
" 'maximum is. Note that this book shall be available for purchase from '\n",
" 'Cambridge University Press and other standard distribution channels, that no '\n",
" 'unauthorized distribution shall be allowed, and that the reader may make one '\n",
" 'copy only for personal on-screen use. CHAPTER 18. PROBABILISTIC '\n",
" 'CLASSIFICATION 527 The test point X (6.75,4.25) , corresponding to (Long _ '\n",
" 'Long) or V e13, e23) , is classified as follows 0 + 1 13 P(vlc1) = P(e1slc1) '\n",
" '. Point to nine for nine, then we lower the classification threshold to '\n",
" 'position twelve now two out of twelve positive predictions are correct so '\n",
" 'one sixth and this is sixteen point six per cent, roughly so the position '\n",
" 'now jumps to a sixteen point six per cent and we now have discovered all '\n",
" 'positive points. Please do not distribute. Feedback is Welcome. Note that '\n",
" 'this book shall be available for purchase from Cambridge University Press '\n",
" 'and other standard distribution channels, that no unauthorized distribution '\n",
" 'shall be allowed, and that the reader may make one copy only for personal '\n",
" 'on-screen use. CHAPTER 12 PATTERN AND RULE ASSESSMENT 347 maximum, and '\n",
" 'minimum to characterize X. If is a symmetric measure, then O( X1 X2) = O(X2 '\n",
" 'X1); and we have to consider only half of the rules. 4Version 1.5-7 of '\n",
" 'package gbn in R, ver _ 2.2.0_ 10.14 Illustrations 377 Research trawls '\n",
" 'Present Absent Exclusive Economic Zone 30 <CUR> 35 <CUR> 40 <CUR> 45 <CUR> '\n",
" \"50 <CUR> 100 100200 Kilometres 55 S 165 E T70 E 175 E 180 E 175 W 170 M' \"\n",
" 'FIGURE 10.18. Map of New Zealand and its surrounding exclusive economic '\n",
" 'zone, showing the location...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 13 of 26: relevance vector machine'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('in this chapter we introduce the relevance vector machine ( rVM ), a '\n",
" 'bayesian sparse ker- nel technique for regression and classification that '\n",
" 'shares many of the characteristics of support vector machines ( svm ) whilst '\n",
" 'avoiding their principal limitations additionally, it typically leads to '\n",
" 'much sparser models resulting in correspondingly faster performance on test '\n",
" 'data whilst maintaining comparable generalization erTor: in contrast to the '\n",
" 'sVM we shall find it more convenient to describe the regres- sion form first '\n",
" 'and then consider the extension to classification tasks. + + * keywords : * '\n",
" 'inference, learning algorithms, kernel techniques, correlation theory, '\n",
" 'statistical mechanics. + _ ams subject classification 2010 : _ primary 60k05 '\n",
" '; secondary 62h10, 62m25, 82b20, 92e15, 90c40, 93a30, 94a50, 95a55, 98a80, '\n",
" '91a60, 96a70, 97d12, 05c99, 12p07, 13qxx, 14j75, 15l65, 16t45, 17r06, 18n17, '\n",
" '20g35, 22g37, 24x16, 26y09, 28z18, 30f27, 37u11, 39a21,')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('An alternative sparse kernel technique, known as the relevance vector '\n",
" 'machine (RVM), is based 0n a Bayesian formulation and provides posterior '\n",
" 'proba- bilistic outputs, as well as having typically much sparser solutions '\n",
" 'than the SVM: Section 7.2 7.1. The relevance vector machine o RVM (Tipping, '\n",
" '2001) is a Bayesian sparse ker- nel technique for regression and '\n",
" 'classification that shares many of the characteristics of the SVM whilst '\n",
" 'avoiding its principal limitations Additionally, it typically leads to much '\n",
" 'sparser models resulting in correspondingly faster performance on test data '\n",
" 'whilst maintaining comparable generalization erTor: In contrast to the SVM '\n",
" 'we shall find it more convenient to introduce the regres- sion form of the '\n",
" 'RVM first and then consider the extension to classification tasks. Relevance '\n",
" 'Vector Machines Support vector machines have been used in a variety of '\n",
" 'classification and regres- sion applications Nevertheless, they suffer from '\n",
" 'a number of limitations, several of which have been highlighted already in '\n",
" 'this chapter: In particular; the outputs of an SVM represent decisions '\n",
" 'rather than posterior probabilities. Also, the SVM was originally formulated '\n",
" 'for two classes, and the extension to K 2 classes is prob- lematic. SPARSE '\n",
" 'KERNEL MACHINES X X x * X X X 3 ++ 0 Xx G#t_ + X X X X X *x* X X X Ke X XxX '\n",
" 'X +x 0 X*x X Xx X X 4 X0 xxox* X 4 Kok ++ Tk 6 kt# Xxx X X X X* X ++ X 2 2 '\n",
" '~2 2 2 Figure 7.12 Example of the relevance vector machine applied to a '\n",
" 'synthetic data set, in which the left-hand plot shows the decision boundary '\n",
" 'and the data points, with the relevance vectors indicated by circles. '\n",
" 'Preface Pattern recognition has its origins in engineering, whereas machine '\n",
" 'learning grew out of computer science. However; these activities can be '\n",
" 'viewed as two facets of the same field, and together they have undergone '\n",
" 'substantial development over the past ten years. In particular; Bayesian '\n",
" 'methods have grown from a specialist niche to become mainstream, while '\n",
" 'graphical...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 14 of 26: directed graph'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this book shall be available for purchase from Cambridge University Press '\n",
" 'and other standard distribution channels, that no unauthorized distribution '\n",
" 'shall not be allowed, and that the reader may make one copy only for '\n",
" 'personal on-screen use. Please do not distribute. 4.1 Graph Concepts Graphs '\n",
" 'Formally, a graph is a mathematical structure consisting of a finite '\n",
" 'non-empty set of vertices o1 nodes, and a set e CV x v of edges consisting '\n",
" 'respectively of unordered pairs of edge pairs. such graphs are also called '\n",
" 'directed acyclic graphs, or dAGs: this is equivalent to the statement that '\n",
" 'there exists an ordering of the nodes such that g0 from any node to any '\n",
" 'lower numbered node. 3.2 three example graphs we begin our discussion of '\n",
" 'conditional independence properties of directed graphs by considering three '\n",
" 'simple examples each involving graphs having just three nodes. 5.5 link '\n",
" 'prediction as a Missing-Value Estimation Problem Section 18.3 of Chap.18 '\n",
" 'discusses how link predictions can be applied to user-item graphs for '\n",
" 'recommendations. in general, both the recommendation problem and the link - '\n",
" 'prediction problem may be viewed as instances of missing value estimation on '\n",
" 'matrices of different types : Recommendation algorithms are described as '\n",
" 'being inversely related to their k-nearest neighbors')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('A closed path with length t > 3 is called a cycle; i.e., a cycle begins and '\n",
" 'ends at the same vertex; and has distinct nodes. 4.1 Graph Concepts Graphs '\n",
" 'Formally, a graph G = (V,E) is a mathematical structure consisting of a '\n",
" 'finite non-empty set V of vertices O1 nodes, and a set E CV x V of edges '\n",
" 'consisting of unordered pairs of vertices. An edge from a node to itself; '\n",
" '(Ui, Ui ) , is called a loop. An undirected graph without loops is called a '\n",
" 'simple graph. A directed edge (vi, Vj) is also called an arc; and is said to '\n",
" 'be from Vi to j: We also say that Vi is the tail and vj the head of the arc_ '\n",
" 'DRAFT 2013-07-10 11:07. Please do not distribute. Feedback is Welcome_ Note '\n",
" 'that this book shall be available for purchase from Cambridge University '\n",
" 'Press and other standard distribution channels, that no unauthorized '\n",
" 'distribution shall be allowed, and that the reader may make one copy only '\n",
" 'for personal on-screen use. A path of minimum length between nodes x and y '\n",
" 'is called a shortest path; and the length of the shortest path is called the '\n",
" 'distance between x and y, denoted as d(x,y)- If no path exists between the '\n",
" 'two nodes, the distance is assumed to be d(x,y) = O_ Connectedness Two nodes '\n",
" 'Ui and Uj are said to be connected if there exists a path between them: A '\n",
" 'graph is connected if there is a path between all pairs of vertices. A '\n",
" 'connected component; Or just component, of a graph is a maximal con- nected '\n",
" 'subgraph. However , this is not the shortest path between them, which '\n",
" 'happens to be (03, V1, VU2, V6) with length 3_ Thus , the distance between '\n",
" 'them is given as d(v3, U6) = 3. Figure 4.1b shows a directed graph with 8 '\n",
" 'vertices and 12 edges. We cam see that edge (U5, U8 ) is distinct from edge '\n",
" '(v8, V5 ) . Degree The degree of a node Vi <CUR> V is the number of edges '\n",
" 'incident with it, and is denoted as d(ui) o just di_ The degree sequence of '\n",
" 'a graph is the list of the degrees of the nodes sorted in non-increasing '\n",
" 'order. A walk starting and ending at the same vertex (i.e., with ...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 15 of 26: bayesian model'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('bayesian inference is a mathematically principled framework for learning '\n",
" 'about parameters and making predictions. it has been successfully ap- plied '\n",
" 'to a variety of problems, including large-scale topic modeling ( hoff- man '\n",
" 'et al,, 2013 ), click-through-rate prediction ( minka, 2001a). in these '\n",
" 'cases, we need to resort to stochas- tic approximations, such as the markov '\n",
" 'chain Monte - carlo (MCMC) ( bishop, 2006; Barber, 2012,; Murphy, 2012), '\n",
" 'variational in- ference ( or expectation propaga- tion ; see, for example, '\n",
" 'Naive Bayes, LDA, and EM: 24.7 ). while there are many excellent books on '\n",
" 'the generative and Bayesian approaches to machine learning, here we have '\n",
" 'also described several specific algorithms for implementing the maximum '\n",
" 'likelihood under diferent assumptions on an underlying data distribution, in '\n",
" 'particular, @xmath0 <CUR> ] # 1#2#3#4#1 * # 2 *, # 3 ( # 4 ) # '\n",
" '5#6#7*#8.*#9**#10**[#11][#12],[#13\\\\ { # 14}\\\\{#15}\\\\')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('We have also described several specific algorithms for implementing the '\n",
" 'maximum likelihood under dif- ferent assumptions on the underlying data '\n",
" 'distribution, in particular, Naive Bayes, LDA, and EM: 24.7 Bibliographic '\n",
" 'Remarks The maximum likelihood principle was studied by Ronald Fisher in the '\n",
" \"beginning of the 20th century: Bayesian statistics follow Bayes' rule, which \"\n",
" 'is named after the 18th century English mathematician Thomas Bayes. There '\n",
" 'are many excellent books on the generative and Bayesian approaches to '\n",
" 'machine learning: See, for example, (Bishop 2006, Koller & Friedman 2009, '\n",
" 'MacKay 2003, Murphy 2012, Barber 2012) . The Bayesian view also includes the '\n",
" 'specification of the prior; which includes design choices such as conjugacy '\n",
" '(Section 6.6.1) with the likelihood. Ad- ditionally one could consider '\n",
" 'latent functions as priors, which results in Gaussian process classification '\n",
" '(Rasmussen and Williams, 2006, chapter 3) . For a dataset X , a parameter '\n",
" 'prior p(0) , and a likelihood function, the posterior p(X | 0)p(0) p(0 x) 3 '\n",
" 'p(x) = p(X | 0)p(0)de (8.22) p(x) Bayesian inference is obtained by applying '\n",
" \"Bayes' theorem. Moreover the propagation of parameter uncertainty to the \"\n",
" 'prediction can be valuable in decision-making systems for risk assessment '\n",
" 'and exploration in the context of data-efficient learn- ing (Deisenroth et '\n",
" 'al,, 2015; Kamthe and Deisenroth; 2018). While Bayesian inference is a '\n",
" 'mathematically principled framework for learning about parameters and making '\n",
" 'predictions, there are some prac- tical challenges that come with it because '\n",
" 'of the integration problems we need to solve; see (8.22) and (8.23). Due to '\n",
" \"the Bayes' theorem is integration in (8.44), the evidence is often hard to \"\n",
" \"compute: also called the Bayes' theorem (6.23) allows us to invert the \"\n",
" 'relationship between 3 \"probabilistic and y given by the likelihood: '\n",
" \"Therefore, Bayes' theorem is sometimes inverse_ probabilistic inverse called \"\n",
" \"the probabilistic inverse. We will discuss Bayes' theorem further i...\")\n",
"\n",
"------------------\n",
"\n",
"'search_term 16 of 26: model parameters'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('machine learning is a key task to take a model and the data to uncover '\n",
" 'estimation can be the values of the models hidden variables <CUR> given the '\n",
" 'observed variables phrased as an 1.5. in this paper, we discuss two ways for '\n",
" 'estimating model optimization problem : parameters 0 using maximum '\n",
" 'likelihood or maximum a posteriori esti- mation. (a) training data ; (b) '\n",
" 'posterior distribution over functions; (c) Samples from the posterior over '\n",
" 'function. it is helpful to phrase model selection as a hierarchical '\n",
" 'inference problem, which allows us to place a prior p(M) on the set of '\n",
" 'models. Feedback is Welcome. Please do not distribute. DATA CLASSIFICATION '\n",
" '10.4.2 Bayesian Inference Parameter 8.3.1 probabilistic classifiers 7.8.6 '\n",
" 'bayesian inference parameter 6.7.0 marginal likelihood 5.9.10 markov chain '\n",
" 'monte carlo 12.38.mh index = cmbx10 at 11pt plus 2pt minus 4pt '\n",
" 'addtoresetequationsection @=11 tempcntc citex[#1]#2@fileswauxout '\n",
" 'tempstemptempsetsametaciteforciteb:=#2 temp')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('It is a distribution that models the uncertainty of the data. In other '\n",
" 'words, once we have chosen the type of function we want as a predictor; the '\n",
" 'likelihood provides the probability of observing data x_ In a complementary '\n",
" 'view; if we consider the data to be fixed (because it has been observed) , '\n",
" 'and we vary the parameters 0, what does L(O) tell us? Only the joint '\n",
" 'distribution has this property Therefore, a probabilistic model is specified '\n",
" 'by the joint distribution of all its random variables 8.4.2 Bayesian '\n",
" 'Inference Parameter A key task in machine learning is to take a model and '\n",
" 'the data to uncover estimation can be the values of the models hidden '\n",
" 'variables <CUR> given the observed variables phrased as an 1. In Section '\n",
" '8.3.1, we already discussed two ways for estimating model optimization '\n",
" 'problem: parameters 0 using maximum likelihood or maximum a posteriori esti- '\n",
" 'mation. As its name indicates, the prior distribution should be defined by '\n",
" 'the learner prior to observing the data. DATA CLASSIFICATION 10.5 '\n",
" 'Probabilistic Classifiers Probabilistic classifiers construct a model that '\n",
" 'quantifies the relationship between the fea- ture variables and the target '\n",
" '(class) variable as a probability. There are many ways in which such a '\n",
" 'modeling can be performed. Two of the most popular models are as follows: 1_ '\n",
" 'Bayes classifier: The Bayes rule is used to model the probability of each '\n",
" 'value of the target variable for a given set of feature variables. Similar '\n",
" 'to mixture modeling in clustering (cf. Sect. 6.5 in Chap. Formally; a '\n",
" 'classifier is a model 0r function M that predicts the class label y for a '\n",
" 'given input example x, that is, y = M(x), where y <CUR> {C1,C2, Ck} is the '\n",
" 'predicted class label (a categorical attribute value). To build the model we '\n",
" 'require a set of points with their correct class labels, which is called a '\n",
" 'training set: After learning the DRAFT 2013-07-10 11:07. Please do not '\n",
" 'distribute. Feedback is Welcome. (a) training data; (b) posterior '\n",
" 'distribution over function...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 17 of 26: class'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('it has been observed that neural networks are sensitive to noise. on the '\n",
" 'other hand , multilayer neural network can approximate virtually any complex '\n",
" 'function in principle. 14.5 Principal Components, Curves and Surfaces 551 '\n",
" 'Walking Speed Verbal Fluency principal components sparse principal component '\n",
" 'direction : when p n the solution is not necessarily unique unless a 0_ for '\n",
" 'any class and so k so we do the ranking and be. 2.3 Least Squares and '\n",
" 'Nearest Neighbors 15 15-Nearest Neighbor Classifier 0 0 1,0 0: 09 0 <CUR> 0 '\n",
" \"00 80: 6 0; '0 808 0 :0 FIGURE 22.1 D-BSSE Karsten Borgwardt Data Mining \"\n",
" '(data types): an application course overlaps with a data type course but has '\n",
" 'a different focus such as text mining; time series, sequences, graphs, and '\n",
" 'spatial data may be covered. 8.8 Instance-Based Learning most of the '\n",
" 'classifiers discussed in the previous sections are eager learners in which '\n",
" 'the classification model is constructed up front and then used to classify a '\n",
" 'specific test instance: In instance-based learning; the training is delayed '\n",
" 'until the last step of classification. this results in a sequence of grades '\n",
" 'for each student')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('To look for the most surprising rules, we also plot in Figure 12.1b the lift '\n",
" 'and conviction value for the same 79 rules. For each class we select the '\n",
" 'most-specific (i.e. with maximal antecedent) rule with the highest relative '\n",
" 'support and then confidence, and also those with the highest conviction and '\n",
" 'then lift. The selected rules are listed in Table 12.11 and Table 12.12, '\n",
" 'respectively: They are also highlighted in Figure 12.1 (as larger white '\n",
" 'symbols). From the itemsets plotted in Figure 12.2, using minsup( X) 15 '\n",
" '(which corresponds to a relative support of 10%) and retaining only those '\n",
" 'itemsets with an average lift value of at least 2.5, we retain 37 '\n",
" 'class-specific itemsets. Among these, the maximal class-specific itemsets '\n",
" 'are shown in Table 12.15, which highlight the features that characterize '\n",
" 'each of the three classes. For instance, for class C1 (Iris-setosa) , the '\n",
" 'essential items are sl1,pl1, pW1 and either sW2 O 813. An example of the '\n",
" '(weighted) word vectors for two classes corresponding to the labels Business '\n",
" 'schools\" and Law schools\" could be as follows: 1_ Business schools: business '\n",
" '(35) , management (31) , school (22), university (11) , cam- pus (15) , '\n",
" 'presentation (12) , student (17), market (11), 2_ Law schools: law (22), '\n",
" 'university (11), school (13), examination (15) , justice (17) , campus (10) '\n",
" ', courts (15) , prosecutor (22), student (15) = Typically; most of the noisy '\n",
" 'words have been truncated from the cluster digest. Geological prediction '\n",
" 'maps of the presence probability (left map) and catch size (right map) '\n",
" 'obtained from the gradient boosted models. class Prof /Man '\n",
" '(Professional/Managerial). The four best predicted classes are seen to be '\n",
" 'Retired; Student _ Prof /Man; and Homemaker _ Figure 10.23 shows the '\n",
" 'relative predictor variable importances as aver - aged over all classes '\n",
" '(10.46) . Figure 10.24 displays the individual relative importance '\n",
" 'distributions (10.45) for each of the four best predicted classes One sees '\n",
" 'that the most relevant predictors a...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 18 of 26: sum of squares error'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('many of the computational challenges of probabilistic modeling are due to '\n",
" 'the application of a sum rule. when there are many variables or discrete '\n",
" 'variables with many states, the summ rule boils down to per- forming a '\n",
" 'high-dimensional sum or integral. in any case, such irrelevant fea- tures '\n",
" 'will almost always result in errors in distance computation. 88 Analytic '\n",
" 'geometry projection error is also called the reconstruction error and it is '\n",
" 'the norm of difference vector between the original vector and its projection '\n",
" 'onto u, i.e., Ilx Tu (a)l = [1 ~2 1]\"ll=v6.5 <CUR> ] we define l to be the '\n",
" 'number of mistakes made by so - called expert(i1,12,iL)running o the '\n",
" 'sequence x1 X2, XT by construction; the set of rounds in which SOA made the '\n",
" 'mistakes equals the value of vt maintained by h when running on the '\n",
" 'sequences (XT,x1)), (xt,h(xT)). using reconstruction errors (3.63) is one '\n",
" 'possible approach to derive principal component analysis (Section 10.3). f() '\n",
" '= eoc - p? cezi - )2 n n x i=1 i-=')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('More concretely if & = [81, xp] T we obtain the marginal marginalization '\n",
" 'property p(xi) = p(81, 8 p)dx i (6.21) by repeated application of the sum '\n",
" 'rule where we integrate/sum out all random variables except Ti, which is '\n",
" 'indicated by 1, which reads \"all except i. 2) Remark Many of the '\n",
" 'computational challenges of probabilistic modeling are due to the '\n",
" 'application of the sum rule: When there are many variables or discrete '\n",
" 'variables with many states, the sum rule boils down to per- forming a '\n",
" 'high-dimensional sum or integral. 88 Analytic Geometry projection error The '\n",
" 'projection error is also called the reconstruction error The corresponding '\n",
" 'projection error is the norm of the difference vector between the original '\n",
" 'vector and its projection onto U, i.e., Ilx Tu (a)l = [1 ~2 1]\"ll=v6. In any '\n",
" 'case, such irrelevant fea- tures will almost always result in errors in '\n",
" 'distance computation. Because high-dimensional data sets are often likely to '\n",
" 'contain diverse features; many of which are irrelevant, the addi- tive '\n",
" 'effect with the use of a sum-of-squares approach, such as the Lz-norm; can '\n",
" 'be very detrimental. We define L to be the number of mistakes made by SOA '\n",
" 'and we define {i1,12, iL} to be the set of rounds in which SOA made the '\n",
" 'mistakes. Now, consider the Expert (i1,12= iL) running o the sequence X1 X2, '\n",
" 'XT By construction; the set Vt maintained by Expert(i1, 12, iL) equals the '\n",
" 'set Vt maintained by SOA when running on the sequence (X1, h(x1)), (XT, '\n",
" 'h(xT)) . The predictions of SOA differ from the predictions of h if and only '\n",
" 'if the round is in {11,62, iL}. The formula in (6.43) can be converted to '\n",
" 'the so-called raw-score raw-score formula formula for variance: for variance '\n",
" 'Vx [x] = Exl [x2] = (Ex[x]) (6.44) The expression in (6.44) can be '\n",
" 'remembered as \"the mean of the square minus the square of the mean\" . The '\n",
" 'total sum is normalized by dividing with the number of frequent patterns '\n",
" 'This provides an outlier score for the pattern. Strictly speaking; the '\n",
" 'normalization can be omitte...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 19 of 26: multivariate gaussian'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the determination of time-series and discrete-sequence similarity measures '\n",
" 'is closely related because the latter can be considered the categorical '\n",
" 'version of the former. Uniformly distributed data will have a Hopkins '\n",
" 'statistic of 0.5 because both &i and bi will be similar. on the other hand; '\n",
" 'the values of Qi will typically be much lower than Bi for the clustered '\n",
" 'data. this approach uses random sampling ; and therefore the measure will '\n",
" \"vary across different random samples. 6_ Ward '<CUR> method: instead of \"\n",
" 'using the change in variance, one might also use the (unscaled) sum of '\n",
" 'squared error as the merging criterion. This is equivalent to setting the '\n",
" 'rHS of eq: 6.8 to c d t=l (mi Sir f2 er ) surprisingly ; this approaches is '\n",
" 'a variant of centroid method. 4.1.7 Nonlinear Distributions : isomap we now '\n",
" 'examine the case in which the data contain nonlinear distributions of '\n",
" 'arbitrary shape. for example, consider the global distribution illustrated '\n",
" 'in Fig: 3.4.3 @xmath0 <CUR> ] # 1([#1 ] ) = cmssbx10 at 10pt plus or minus 2 '\n",
" 'standard deviations + * pacs numbers : * 89.65.')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Just like the Euclidean distance is a special case of a metric (Section 3.3) '\n",
" ', the Kullback-Leibler divergence is a special case of two more general '\n",
" 'classes of divergences called Bregman divergences and f-divergences. The '\n",
" 'study of divergences is beyond the scope of this book, and we refer for more '\n",
" 'details to the recent book by Amari (2016), one of the founders of the field '\n",
" 'of information geometry 6.5 Gaussian Distribution The Gaussian distribution '\n",
" 'is the most well-studied probability distribution for continuous-valued '\n",
" 'random variables. It is also referred to as the normal normal distribution '\n",
" 'distribution. This metric is a generalization of the Euclidean measure, and '\n",
" 'stretches the distance values along the principal components according to '\n",
" 'their variance. A more sophisticated approach; referred to as ISOMAP, uses '\n",
" 'nonlinear embeddings to account for the impact of nonlinear data '\n",
" 'distributions Local normalization can often provide more effective measures '\n",
" 'when the distribution of the data is heterogeneous. Other data types such as '\n",
" 'categorical data, text, temporal, and graph data present further challenges. '\n",
" 'The determination of time-series and discrete-sequence similarity measures '\n",
" 'is closely related because the latter can be considered the categorical '\n",
" 'version of the former. The Mahalanobis distance is equivalent to the '\n",
" 'Euclidean distance in such a transformed axes-rotated) data set after '\n",
" 'dividing each of the transformed coordinate values by the standard deviation '\n",
" 'of the data along that direction: As a result, the data point B will have a '\n",
" 'larger distance from the origin than data point A in Fig: 3.3. 3.2.1.7 '\n",
" 'Nonlinear Distributions: ISOMAP We now examine the case in which the data '\n",
" 'contain nonlinear distributions of arbitrary shape: For example, consider '\n",
" 'the global distribution illustrated in Fig: 3.4. Index 409 inverse element; '\n",
" '36 invertible, 24 Isomap, 136 isomorphism, 49 Jacobian, 146, 150 Jacobian '\n",
" \"determinant; 152 Jeffreys-Lindley paradox, 287 Jensen's inequality 239 jo...\")\n",
"\n",
"------------------\n",
"\n",
"'search_term 20 of 26: synthetic data points'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('this paper is divided into two parts. in the first part , we discuss '\n",
" 'different aspects of data mining such as sampling, pattern mining, '\n",
" 'condensation - based approach, principal component analysis, and '\n",
" 'privacy-preserving data minimization. each of the aforementioned aspects '\n",
" 'will be discussed in different segments of this section_ 2.4.1 1.5.2 '\n",
" 'Frequent Pattern Mining ( fpm ) refers to the task of extracting informative '\n",
" 'and useful pat- terns in massive and complex datasets ; while maintaining '\n",
" 'k-anonymity by using the statistics of that group. 2, that the eigenvec- '\n",
" 'tors define a group-specific axis system, along which the data records are '\n",
" 'uncorrelated. 4.8.6 5.7 Summary social networks have become increasingly '\n",
" 'popular in recent years, because of their ability to connect geographically '\n",
" 'and culturally diverse participants @ priori: these represent the attributes '\n",
" 'that can be converted to multidimensional representations by use embedding '\n",
" 'techniques. simi- larly; graphs can easily convert from one dimension to '\n",
" 'another by converting them to lower dimensional linear spaces known as '\n",
" 'principal subspaces. 6.9.3 7.0 8.25 9.75 10.50 11.30 12.38 13.40 14')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Data records are gener - ated independently along the eigenvectors, with '\n",
" 'variance equal to the corresponding eigenvalues. The uniform distribution is '\n",
" 'typically used for synthetic data generation_ because it is assumed that the '\n",
" 'data distribution does not change significantly within the small locality '\n",
" 'defined by a group. While the uniform distribution is a local approx- '\n",
" 'imation, the global distribution of the generated records generally matches '\n",
" 'the original data quite well. The approach can also be generalized to data '\n",
" 'streams_ by maintaining group statistics incrementally. PRIVACY-PRESERVING '\n",
" 'DATA MINING The Mondrian approach is naturally designed for numeric '\n",
" 'attributes with an ordering on the values. However, the approach can also be '\n",
" 'generalized to categorical attributes by designing appropriate split rules '\n",
" 'for the attributes. 20.3.1.4 Synthetic Data Generation: Condensation-Based '\n",
" 'Approach The condensation-based approach generates synthetic data that '\n",
" 'matches the original data distribution; while maintaining k-anonymity. This '\n",
" 'means that k synthetic records are gen- erated for each group of k records; '\n",
" 'by using the statistics of that group. 2, that the eigenvec- tors define a '\n",
" 'group-specific axis system, along which the data records are uncorrelated. '\n",
" 'The variance of the data along each eigenvector is equal to the '\n",
" 'corresponding eigen- value. The synthetic data set to be generated; is '\n",
" 'modeled as mixture of m clusters, where the mean of each cluster is the mean '\n",
" 'of the corresponding group of original data records_ 4 Generate synthetic '\n",
" 'data records for each of the m clusters. For each cluster; the number and '\n",
" 'mean of the synthetic records matches its base group. Section 12.+ 121 '\n",
" 'Principal Component Analysis Principal component analysis, or PCA, is a '\n",
" 'technique that is widely used for appli- cations such aS dimensionality '\n",
" 'reduction, lossy data compression , feature extraction, and data '\n",
" 'visualization (Jolliffe. 2002). It is also known as the Karhunen-Loeve trans '\n",
" '- form There are t...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 21 of 26: probabilistic pca'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('as machine learning allows us to model more intricate distributions 0n ever '\n",
" 'more complex types of data, a developer of probabilistic machine learn- ing '\n",
" 'models would have to understand these more technical aspects. principal '\n",
" 'component analysis ( or PCA ) is a technique that is widely used for appli- '\n",
" 'cations such a s dimensionality reduction, lossy data compression, feature '\n",
" 'extraction, and data visualization ( jolliffe 2002 ). it is also known as '\n",
" 'the Karhunen-Loeve trans - form there are tWO commonly used definitions of '\n",
" 'this algorithm that give rise to the same algorithm. Tipping and Bishop '\n",
" '(1999) proposed this latent-variable model as probialistic PCa (PPCA). '\n",
" 'Section 12.4.5 = Variance of 1-dimensional points in dv > 0 (2.8) n In fact, '\n",
" 'the goal of computer science is to successively determine orthonormal '\n",
" 'vectors U maximizing TT Cv. how can one determine such directions? by D, '\n",
" 'terminates after performing at most cf(e,0) operations The output of a '\n",
" 'classifier with proba- bility of at least 1 _ <CUR> (over the random samples '\n",
" \"he receives ), lp(hA) < minh'<CUR>\")\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('Tipping and Bishop (1999) proposed this latent-variable model as '\n",
" 'probabilistic PCA (PPCA). Section 12.+ 121 Principal Component Analysis '\n",
" 'Principal component analysis, or PCA, is a technique that is widely used for '\n",
" 'appli- cations such aS dimensionality reduction, lossy data compression , '\n",
" 'feature extraction, and data visualization (Jolliffe. 2002). It is also '\n",
" 'known as the Karhunen-Loeve trans - form There are tWO commonly used '\n",
" 'definitions of PCA that give rise to the same algorithm. PCA can be defined '\n",
" 'as the orthogonal projection of the data onto a lower dimensional linear '\n",
" 'space, known as the principal subspace. such that the variance of the '\n",
" 'projected data is maximized (Hotelling: [933). An alternative way to '\n",
" 'approach proba- bility is to start with the concept of expectation, and '\n",
" '\"work backward\" to derive the necessary properties of a probability space '\n",
" '(Whittle, 2000). As machine learning allows us to model more intricate '\n",
" 'distributions 0n ever more complex types of data, a developer of '\n",
" 'probabilistic machine learn- ing models would have to understand these more '\n",
" 'technical aspects. Ma- chine learning texts with a probabilistic modeling '\n",
" 'focus include the books by MacKay (2003); Bishop (2006); Rasmussen and '\n",
" 'Williams (2006); Bar- ber (2012); Murphy (2012). 02021 . = Variance of '\n",
" '1-dimensional points in Dv > 0 (2.8) n In fact, the goal of PCA is to '\n",
" 'successively determine orthonormal vectors U maximizing TT Cv. How can one '\n",
" 'determine such directions? by D, terminates after performing at most cf(e, '\n",
" '0) operations The output of A, denoted hA, can be applied to predict the '\n",
" 'label of a new example while performing at most cf(e, 0) operations The '\n",
" 'output of A is probably approximately correct; namely, with proba- bility of '\n",
" 'at least 1 _ <CUR> (over the random samples A receives) , Lp(hA) < '\n",
" \"minh'<CUR>x Lp(h') + <CUR> 2 . 2.4.3.1 Principal Component Analysis PCA is \"\n",
" 'generally applied after subtracting the mean of the data set from each data '\n",
" 'point: However , it is also possible to use it wi...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 22 of 26: component analysis'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('principal component analysis (PCA ) is a technique that is widely used for '\n",
" 'appli- cations such as dimensionality reduction, lossy data compression, '\n",
" 'feature extraction, and data visualization. it is also known as the '\n",
" 'Karhunen-Loeve trans - form there are tWO commonly used definitions of PCA '\n",
" 'that give rise to the same algorithm. 2.3.1 Principal Component Analysis '\n",
" '3.4.2 Categorization by Constituent Components second way of categorizing '\n",
" 'ensemble analysis algorithms is on the basis of their constituent '\n",
" 'components. another application is that of exploring outlier scores over '\n",
" 'multiple subspaces, and then providing the best result. in this chapter; we '\n",
" 'will discuss how one can reduce the dimension of all sparse vectors using '\n",
" 'again a random matrix. 195 identity automorphism, 346 identity mapping, 49 '\n",
" 'identity matrix, 23 image, 58, 139 independent and identically distributed, '\n",
" '195, 260, 266 independent inference network, 344 injective ; 73 inner '\n",
" 'product space, 73 intermediate variables, 162 inverse, 24. this problem '\n",
" 'arises often in practice and the solution is called the least-squares '\n",
" 'solution (assuming the dot product between TPk and W provides the numerical '\n",
" 'prediction of the test instance): this is discussed further in Section 9.')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('2.4.3.1 Principal Component Analysis PCA is generally applied after '\n",
" 'subtracting the mean of the data set from each data point: However , it is '\n",
" 'also possible to use it without mean centering; as long as the mean of the '\n",
" 'data is separately stored: This operation is referred to as mean centering; '\n",
" 'and it results in a data set centered at the origin. The goal of PCA is to '\n",
" 'rotate the data into an axis-system where the greatest amount of variance is '\n",
" 'captured in a small number of dimensions. Section 12.+ 121 Principal '\n",
" 'Component Analysis Principal component analysis, or PCA, is a technique that '\n",
" 'is widely used for appli- cations such aS dimensionality reduction, lossy '\n",
" 'data compression , feature extraction, and data visualization (Jolliffe. '\n",
" '2002). It is also known as the Karhunen-Loeve trans - form There are tWO '\n",
" 'commonly used definitions of PCA that give rise to the same algorithm. PCA '\n",
" 'can be defined as the orthogonal projection of the data onto a lower '\n",
" 'dimensional linear space, known as the principal subspace. such that the '\n",
" 'variance of the projected data is maximized (Hotelling: [933). Prediction on '\n",
" 'a test instance T is performed after transforming it to this new '\n",
" 'k-dimensional space as TPk: The dot product between TPk and W provides the '\n",
" 'numerical prediction of the test instance: The effectiveness of principal '\n",
" 'component regression is because of the discarding of the low-variance '\n",
" \"dimensions, which are either redundant directions (zero eigenvalues ' or \"\n",
" 'noisy directions (very small eigenvalues) . If all directions are included '\n",
" 'after PCA-based axis rotation (i.e., k 3 d) , then the approach will yield '\n",
" 'the same results as linear regression on the original data. 195 ICA, 346 '\n",
" 'identity automorphism, 49 identity mapping, 49 identity matrix, 23 image, '\n",
" '58, 139 independent and identically distributed, 195, 260, 266 independent '\n",
" 'component analysis, 346 inference network, 344 injective, 48 inner product; '\n",
" '73 inner product space, 73 intermediate variables, 162 inverse, 24 . This '\n",
" 'problem arises ...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 23 of 26: sampling'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('sampling is one of the most flexible methods for stream summarization. it '\n",
" 'has distinct efficiency advantages because it works with a much smaller data '\n",
" 'set. on the other hand, sampling can be more naturally combined with '\n",
" 'ensemble methods such as bagging to improve accuracy. in this section, we '\n",
" 'consider some simple strategies for generating random samples from a given '\n",
" 'distribution : because the samples will be generated by a computer algorithm '\n",
" 'they will in fact be pseudo-random numbers, that is, they must be deter- min '\n",
" '- min- calculated, but must nevertheless pass appropriate tests for '\n",
" 'randomness and dimensionality. 2.4.1 1.2 Reservoir Sampling 3.3.6 uniform '\n",
" 'sampling 4.5.7 Gibbs sampling Gibbs sampling (Geman and Geman, 1984) is a '\n",
" 'simple and widely applicable markov chain Monte Carlo algorithm and 5.33 '\n",
" 'samples {2() } drawn from q(2) ELf] | f(z)ptz ) dz = fsg3ua) + @xmath0 '\n",
" 'department of physics & astronomy, university of california, santa barbara '\n",
" 'ca 93106 usa + e - mail : <EMAIL> 6.15 is known as the probability integral '\n",
" 'transform, and')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('These details aside, it remains the case that if the length scales over '\n",
" 'which the distributions vary are very different in different directions, '\n",
" 'then the Metropolis Hastings algorithm can have very slow convergence. 11.3. '\n",
" 'Gibbs Sampling Gibbs sampling (Geman and Geman, 1984) is a simple and widely '\n",
" 'applicable Markov chain Monte Carlo algorithm and can be seen as a special '\n",
" 'case of the Metropolis- Hastings algorithm. 8.5.1.1 Sampling Methods The '\n",
" 'first step is to pick a sample & of size S n from the data D, and compute '\n",
" 'all pairwise distances between the data points in sample S and those in '\n",
" 'database D. There are a total of n S such pairs. This process requires O(n . '\n",
" 's) < O(n?) distance computations. On the other hand, sampling can be more '\n",
" 'naturally combined with ensemble methods (cf. Sect. 11.8) such as bagging to '\n",
" 'improve accuracy. Furthermore, sampling has distinct efficiency advantages '\n",
" 'because it works with a much smaller data set. For example, for a data set '\n",
" 'containing a rare to normal ratio of 1:99, it is possible for a resampling '\n",
" 'tech- nique to work effectively with 2 % of the original data when the data '\n",
" 'is resampled into an equal mixture of the normal and anomalous classes. This '\n",
" 'kind of resampling translates to a performance improvement of a factor of '\n",
" '50. Thus, no duplicates are included in the sample, unless the original data '\n",
" 'set D also contains duplicates. In sampling with replacement from a data set '\n",
" 'D with n records, the records are sampled sequentially and independently '\n",
" 'from the entire data set D for a total of [n f] times. Thus, duplicates are '\n",
" 'possible because the same record may be included in the sample over '\n",
" 'sequential selections. For discrete variables, a simple approach is called '\n",
" 'uniform Sampling: The joint distribution for a directed graph is defined by '\n",
" '(11.4). Each sample from the joint distribution is obtained by first setting '\n",
" 'those variables zi that are in the evidence set equal to their observed '\n",
" 'values. 12.2.1 Reservoir Sampling Sampling is one of the...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 24 of 26: markov chain'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('a graph clustering method is an iterative method that interleaves matrix '\n",
" 'expan- sion and inflation steps. Matrix expansion corresponds to taking '\n",
" 'successive powers of the transition matrix, leading to random walks of '\n",
" 'longer lengths. on the other hand; matrix inflation makes the higher '\n",
" 'probability transitions even more likely and reduces the lower probability '\n",
" 'ones. we revisit the previous example of tracking items with RFID tags by '\n",
" 'simulating a random walk on a weighted graph : this is because nodes within '\n",
" 'a cluster have higher similarities or weights, and nodes across clusters '\n",
" 'have lower similarities. in ergodic markov chains, a necessary requirement '\n",
" 'is that it is possible to reach any state from any other state using a '\n",
" 'sequence of one Or more transitions_ this condition is referred to as strong '\n",
" 'connectivity: an informal description is provided here to facilitate '\n",
" 'understanding. (11.41) a Markov chain that respects detailed balance is said '\n",
" 'to be reversible. 16.3 @xmath0 # 1#2#3#4#5#1 * # 2 *, # 3 ( # 4 ) # '\n",
" '5(#6)#7([#8]#9)[#10][#11 ] # 12[#12 ] [ # 13],[#14\\\\{#15}\\\\ { # 16')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"(\"M = Mi M2 Mn m m 'Mj i,j=1 mi; (2) i,j=1 (16.32) Equations (16.31) and \"\n",
" '(16.32) imply that M2 is precisely the transition probability matrix for the '\n",
" 'Markov chain over two time-steps. Likewise, the three step transition matrix '\n",
" 'is M2 . M = M3 . A Markov Model can be depicted as a set of nodes '\n",
" 'representing the states and a set of edges representing the events that '\n",
" 'cause movement from one state to another. The probability ofan edge provides '\n",
" 'the conditional probability of the corresponding event. Clearly, the order '\n",
" 'of the model encodes the memory length of the string segment retained for '\n",
" 'the modeling process. First-order models correspond to the least amount of '\n",
" 'retained memory: To understand how Markov Models work, the previous example '\n",
" 'of tracking items with RFID tags will be revisited. The set of nodes '\n",
" 'incident on i is denoted by In(i), and the set of end points of the utgoing '\n",
" 'links of node i is denoted by Out(i). The steady-state probability at a node '\n",
" 'i is denoted by w(i)_ In general, the transitions of a Web surfer can be '\n",
" 'visualized as a Markov chain; in which an n X n transition matrix P is '\n",
" 'defined for a Web graph with n nodes. The PageRank of a node i is equal to '\n",
" 'the steady-state probability T(i) for node i, in the Markov chain model. In '\n",
" 'ergodic Markov chains, a necessary requirement is that it is possible to '\n",
" 'reach any state from any other state using a sequence of one Or more '\n",
" 'transitions_ This condition is referred to as strong connectivity: An '\n",
" 'informal description is provided here to facilitate understanding: 18.4. '\n",
" 'Therefore, the following is true: E[X] > aP(X > &) (12.4) The above '\n",
" 'inequality can be rearranged to obtain the final result: The Markov '\n",
" 'inequality is defined only for probability distributions of nonnegative '\n",
" 'values and provides a bound only on the upper tail In practice; it is often '\n",
" 'desired to bound both tails of probability distributions over both positive '\n",
" 'and negative values_ Consider the case where X is a random variable that is '\n",
" 'not necessarily nonneg...')\n",
"\n",
"------------------\n",
"\n",
"'search_term 25 of 26: output unit activation'\n",
"\n",
"\n",
"\n",
"+++++++\n",
"\n",
"\n",
"model description: #1 of 1\n",
"\n",
"('the elegant simplicity of linear models can still be leveraged in these '\n",
" 'settings. let the training dataset consist of n points in a d-dimensional '\n",
" 'space, with Yi being the class label for point Xi_ we assume that the '\n",
" 'dimensions or the attributes Xj are numeric or categorical, and that there '\n",
" 'are k distinct classes, SO that Yi <CUR> {C1,C2,ck}@xmath0 <CUR> ] }%_1. in '\n",
" 'this case, each response variable Yi is modeled as an outcome of a '\n",
" '(typically exponential) probability distribution with mean f(W Xi) as '\n",
" 'follows: Yi Probability distribution ; and its inverse f-1 c (.) is referred '\n",
" 'to as the link function. it is assumed that set of initial state '\n",
" 'probabilities governs the a priori distribution of doers and slackers '\n",
" 'according to the ith feature. on executing a potential merge between the two '\n",
" 'clusters i and j- therefore, the change in variance because of the merge is '\n",
" 'always shown to always be a positive quantity. if the desired output '\n",
" 'consists of one or more continuous variables, then the task is called '\n",
" 'regression. An example of such a regression problem would be the pre- '\n",
" 'diction of yield in chemical manufacturing process in which the inputs '\n",
" 'include the concentrations of reactants,')\n",
"\n",
" the context (first 2k chars) is: \n",
"\n",
"('In this case, the probabil ity distribution of the response yi is the normal '\n",
" 'distribution with mean f(W . We conclude that R {sW1,PUz C2 is not '\n",
" 'productive. In fact; its generalization Rj is the one that is productive, as '\n",
" 'shown in Example 12.16. The symbols are generated from the model by a '\n",
" 'sequence of transitions from one state to the other. Each visit to a state '\n",
" '(including self-transitions) generates a symbol drawn from a categorical4 '\n",
" 'probability distribution on E. The symbol emission distribution is specific '\n",
" 'to each state: The probability P(oilsj ) that the symbol 0i is generated '\n",
" 'from state <CUR>j is denoted by 0i(i). Let SEiuj denote the variance of a '\n",
" 'potential merge between the two clusters i and j- Therefore, the change in '\n",
" 'variance on executing a merge of clusters i and j is as follows: ASEiuj '\n",
" 'SEiuj SEi 5 SEj: (6.9) This change can be shown to always be a positive '\n",
" 'quantity. The cluster pair with the smallest increase in variance because of '\n",
" 'the merge is selected as the relevant pair to 6.4. Evaluate A1 using k-fold '\n",
" \"CV For each i Pre-processing Dfull divide into k equal sets Di Set Di Dfull' \"\n",
" 'Di Output predictor fa* = Ax* (Dfull) Train on Di: fdi = A(D;) Compute Ro{l '\n",
" '(fdi) Output CV(a ) = Kzk-1 Rp{ (fu;) 3. If the desired output consists of '\n",
" 'one or more continuous variables, then the task is called regression. An '\n",
" 'example of a regression problem would be the pre- diction of the yield in a '\n",
" 'chemical manufacturing process in which the inputs consist of the '\n",
" 'concentrations of reactants, the temperature, and the pressure. CHAPTER 19. '\n",
" 'DECISION TREE CLASSIFIER 529 Chapter 19 Decision Tree Classifier Let the '\n",
" 'training dataset D = {xi, Yi}%_1 consist of n points in a d-dimensional '\n",
" 'space, with Yi being the class label for point Xi_ We assume that the '\n",
" 'dimensions or the attributes Xj are numeric or categorical, and that there '\n",
" 'are k distinct classes, SO that Yi <CUR> {C1,C2, Ck} . Nevertheless, the '\n",
" 'elegant simplicity of linear models can still be leveraged in these settings '\n",
" 'In gen...')\n",
"\n",
"Completed Summary Search - 10.13.2021_00-28\n",
"\n",
"\n",
"\n",
"\n",
" Moving to next term list 1634084896.309999 \n",
"\n",
"\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qMh68201DvER"
},
"source": [
"\n",
"---\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "D9JLAlTyJ4cE"
},
"source": [
"# save data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "r-mlRsM2Rflr"
},
"source": [
"model data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "VzmmjdINJ5m5",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "66758e27-6a8a-4be5-83e5-673c4b6ae847"
},
"source": [
"# Build a timestamped base name shared by the three pipeline config files\n",
"date_time = datetime.now().strftime(\"%m.%d.%Y, %H-%M\")\n",
"header = remove_string_extras(\n",
"    f\"{course_name}_info-retrieval_{cust_model_name}{date_time}\"\n",
")\n",
"\n",
"# one .yaml name per pipeline type (QA, retriever, summarizer)\n",
"qa_name = f\"[QA-pipeline-{questions_version}]{header}.yaml\"\n",
"ret_name = f\"[Retriever-pipeline-{questions_version}]{header}.yaml\"\n",
"sum_name = f\"[Summarizer-pipeline-{questions_version}]{header}.yaml\"\n",
"\n",
"outnames = [qa_name, ret_name, sum_name]"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "b1T7_slORfKW"
},
"source": [
"dataframe of responses"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jw-pKzNFRfeF",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "6d93d845-95ac-4b3c-86a6-8a9700ac0424"
},
"source": [
"# Persist the query/response dataframe in both CSV and Excel form;\n",
"# only the Excel copy is uploaded to Dropbox.\n",
"df_name_csv = f\"Response DB - {header}.csv\"\n",
"df_name_xlsx = f\"Response DB - {header}.xlsx\"\n",
"\n",
"info_queries.to_csv(df_name_csv)\n",
"info_queries.to_excel(df_name_xlsx)\n",
"\n",
"# CSV upload intentionally disabled:\n",
"# put_in_dropbox(df_name_csv, subfolder=\"haystack info retrieval - output queries\")\n",
"put_in_dropbox(df_name_xlsx, subfolder=\"haystack info retrieval - output queries\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1ic3q1h_bA-K"
},
"source": [
"model configs to `.yaml`\n",
"\n",
"*this is commented out: the notebook is re-run each session and the parameters are specified directly in the pipeline functions. The YAML export also did not work as written and has not yet been debugged.*"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mJ8E6tmLK4uV",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "887e78d6-08f3-4513-ec4b-0145fbf9d61c"
},
"source": [
"# NOTE: intentionally disabled. The notebook is re-run each session with the\n",
"# parameters specified directly in the pipeline functions, and the YAML export\n",
"# below also failed previously and has not yet been debugged.\n",
"# # %%time\n",
"# comp_lvl = 5\n",
"# pipe.pipeline.save_to_yaml(qa_name)\n",
"# search_pipe.pipeline.save_to_yaml(ret_name)\n",
"# sumsearch_pipe.pipeline.save_to_yaml(sum_name)\n",
"\n",
"# # joblib.dump(pipe, qa_name, compress=comp_lvl)\n",
"# # joblib.dump(search_pipe, ret_name, compress=comp_lvl)\n",
"# # joblib.dump(sumsearch_pipe, sum_name, compress=comp_lvl)\n",
"\n",
"\n",
"# for pipename in outnames:\n",
"\n",
"# if get_size_mb(pipename) < 150:\n",
"# put_in_dropbox(pipename)\n",
"# else:\n",
"# print(\"The config file {} is too large for dropbox\".format(pipename))\n",
"\n",
"# print(\"Finished saving files! - \", datetime.now())"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YG7SZzS4R_Yo"
},
"source": [
"## package information\n",
"\n",
"- for validation: to confirm that the package versions used in the script version and in Colab are approximately the same"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Xs4EOn82SDZU",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "1cb7be53-7b26-4b19-fef8-c1fb032d74c0"
},
"source": [
"# Collect installed-package versions without relying on pip internals:\n",
"# pip._internal.utils.misc.get_installed_distributions is an unstable private\n",
"# API that was removed in pip >= 21.3. Prefer the stdlib importlib.metadata\n",
"# (Python 3.8+) and fall back to the legacy helper on older runtimes.\n",
"import sys\n",
"\n",
"try:\n",
"    from importlib.metadata import distributions\n",
"\n",
"    def _installed_versions():\n",
"        \"\"\"Map installed distribution name (lowercased) -> version string.\"\"\"\n",
"        return {\n",
"            dist.metadata[\"Name\"].lower(): dist.version\n",
"            for dist in distributions()\n",
"            if dist.metadata[\"Name\"]\n",
"        }\n",
"\n",
"except ImportError:  # Python < 3.8 (e.g. older Colab kernels)\n",
"    from pip._internal.utils.misc import get_installed_distributions\n",
"\n",
"    def _installed_versions():\n",
"        \"\"\"Map installed distribution name (lowercased) -> version string.\"\"\"\n",
"        return {p.key: p.version for p in get_installed_distributions()}\n",
"\n",
"\n",
"def get_imported_packages():\n",
"    \"\"\"Return (name, version) pairs for installed packages currently imported.\n",
"\n",
"    Only modules present in sys.modules that are also installed distributions\n",
"    are reported, so the result approximates this notebook's requirements.\n",
"    Note: only matches when the import name equals the distribution name.\n",
"    \"\"\"\n",
"    installed = _installed_versions()\n",
"\n",
"    imported_modules = set(sys.modules.keys())\n",
"    # pip itself is not a runtime requirement; discard() (unlike remove())\n",
"    # does not raise KeyError if pip was never imported in this kernel.\n",
"    imported_modules.discard(\"pip\")\n",
"\n",
"    return [(m, installed[m]) for m in imported_modules if installed.get(m, False)]\n",
"\n",
"\n",
"def generate_requirements(filepath: str, modules):\n",
"    \"\"\"Write a requirements.txt-style file with compatible-release pins (~=).\"\"\"\n",
"    with open(filepath, \"w\") as f:\n",
"        for module, version in modules:\n",
"            f.write(f\"{module}~={version}\\n\")\n",
"\n",
"\n",
"generate_requirements(\"requirements.txt\", get_imported_packages())"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qGu4iar-UhW_"
},
"source": [
"# Ideas for Future\n",
"\n",
"\n",
"Other pipeline configurations and features are possible, as detailed [here](https://haystack.deepset.ai/docs/latest/apipipelinesmd)"
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment