pszemraj/AutoTA-Example.md

## AutoTA-Example.md

      
    Raw
  

              AutoTA-Example.md
            
          
    AutoTA Example


A series of transformer models that embed your course documents, search them when you ask a question, and construct a reply in coherent English using a text2text generator.
Based on the work done by deepset.ai in their Haystack library.
As the notebook preview does not always load on GitHub, click here for the Colab notebook / demo. Instructions are provided on what to edit in order to use the notebook, both in simple cases (e.g. just changing the questions) and when adapting the notebook for other purposes.

Example: The model's explanation of K-Means

question to model: What is the k-means algorithm?
model response:
("'the k means algorithm is one of the most popular iterative descent clustering methods intended for situations in which all variables are of the quantitative type, and that for each such assignment there is a unique optimum for the k pk, the algorithm must converge after a finite number of iterations. it is not clear that the hard assignment to the nearest cluster is the most appropriate. since the assignment of observations to clusters at any iteration is a perturbation of that for the previous iteration, only a very small fraction of all possible assignments. however, these algorithms converge to local optima which may be highly suboptimal when compared to the global optimum. in the present invention, the convergence properties of these algorithms are studied and it is shown that if the algorithm converges to a local maximum of l, this will correspond to a global maximum of the log likelihood function; and so in this incremental algorithm is increasing the value of l and if the result no longer changes, a local optimum has been reached.'")


## haystack-course-info-extractor-iml-s21.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "name": "HayStack Course Info Extractor - IML s21.ipynb",
      "provenance": [],
      "collapsed_sections": [
        "d3yqOW7zbKHJ",
        "Wmd3HbeHd1JP",
        "Y_t5fJLMiii2",
        "kHpqZ9SDefFL",
        "YG7SZzS4R_Yo"
      ],
      "machine_shape": "hm",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "90d338fc12f24a0f9f2c2cb4fc07a555": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_768cd79789a44724872bb6f315f01667",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_fa5ecb3f7c244f32ae847714cb212923",
              "IPY_MODEL_9cf3d45a7395476685748dc565f42dfe",
              "IPY_MODEL_eb7b79ea89f64e2c890d3e0dcb60e0de"
            ]
          }
        },
        "768cd79789a44724872bb6f315f01667": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "fa5ecb3f7c244f32ae847714cb212923": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_81097802618746b399b2788a14635853",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Updating embeddings: ",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d90f3d0362c54b61a76919bd2c85bc2d"
          }
        },
        "9cf3d45a7395476685748dc565f42dfe": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_4386721c9689477cb52f10f82e3efb97",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 34525,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 34525,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_99d8e8d3231b4e99a983b8bf7bc7a5e2"
          }
        },
        "eb7b79ea89f64e2c890d3e0dcb60e0de": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_6eb390f7983b4c9384156d2a48d2f3ac",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 40000/? [09:04&lt;00:00, 80.84 Docs/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b463b88099b145d9b8c90c0c361411f2"
          }
        },
        "81097802618746b399b2788a14635853": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d90f3d0362c54b61a76919bd2c85bc2d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "4386721c9689477cb52f10f82e3efb97": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "99d8e8d3231b4e99a983b8bf7bc7a5e2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6eb390f7983b4c9384156d2a48d2f3ac": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b463b88099b145d9b8c90c0c361411f2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "49d9beb622bc4083bda44d153cde09d5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_9bc7d217fd494f3babd6b1f559dcd0e0",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_64c09b2062be42daab3e4642f5377ac2",
              "IPY_MODEL_0f8c70e244db410f926a06ef57322375",
              "IPY_MODEL_4e7f0a5a083f4ac78ece5e310b0fb98f"
            ]
          }
        },
        "9bc7d217fd494f3babd6b1f559dcd0e0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "64c09b2062be42daab3e4642f5377ac2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_848c259c189d4308a23b3de278c544e3",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Create embeddings: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_14b280c4f7f142f0976148b8f95d549b"
          }
        },
        "0f8c70e244db410f926a06ef57322375": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_7694f4df92b2440a99acce949ce24d49",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "",
            "max": 10000,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 10000,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_911f175505c14d3d9d0cd82285f68d37"
          }
        },
        "4e7f0a5a083f4ac78ece5e310b0fb98f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_8f10292e23f049b68fe51a8d4b343e67",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 10000/10000 [01:23&lt;00:00, 119.80 Docs/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_2fa250bbb6f5473c91570ca5ce4ab553"
          }
        },
        "848c259c189d4308a23b3de278c544e3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "14b280c4f7f142f0976148b8f95d549b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "7694f4df92b2440a99acce949ce24d49": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "911f175505c14d3d9d0cd82285f68d37": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "8f10292e23f049b68fe51a8d4b343e67": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "2fa250bbb6f5473c91570ca5ce4ab553": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "a1c248f9f3854ab996d683acd10db563": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_c01ef5167c704b719cdcbc9455265f20",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_d7ba15dedcbb475484fee6814351bff9",
              "IPY_MODEL_71a1bbc7dad44bc8a296127e0da6ea64",
              "IPY_MODEL_108c208362a44b35a78d89f41a9f2649"
            ]
          }
        },
        "c01ef5167c704b719cdcbc9455265f20": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "d7ba15dedcbb475484fee6814351bff9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_4bf352be47dd45b2ad0465202950c009",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Create embeddings: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_64762b258c10418d8665a047481890a0"
          }
        },
        "71a1bbc7dad44bc8a296127e0da6ea64": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_5a8089e9b8d8447f8d6a72fe46b90d22",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "",
            "max": 10000,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 10000,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_82d26a176bcb4719b89b39ceb6040b3e"
          }
        },
        "108c208362a44b35a78d89f41a9f2649": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_4d0ee98d72984214b05e0e9c47865ce6",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 10000/10000 [01:23&lt;00:00, 119.60 Docs/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_bd91aaafdfde46b5a7357ab5b206a6e2"
          }
        },
        "4bf352be47dd45b2ad0465202950c009": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "64762b258c10418d8665a047481890a0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5a8089e9b8d8447f8d6a72fe46b90d22": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "82d26a176bcb4719b89b39ceb6040b3e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "4d0ee98d72984214b05e0e9c47865ce6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "bd91aaafdfde46b5a7357ab5b206a6e2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9438931fa26c4e558d9149a468832438": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_a3907edd4a7a49edb7cda384b82d94c1",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_8c81af657f384ec5af715b3c0af2c16b",
              "IPY_MODEL_69eec051a1764886bd80fb85d9afc869",
              "IPY_MODEL_c31ed6dcf1f345ba8884545962a07ec9"
            ]
          }
        },
        "a3907edd4a7a49edb7cda384b82d94c1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "8c81af657f384ec5af715b3c0af2c16b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_27e530f9b7314479b59a4cf96a19a948",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Create embeddings: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_229af4dc1b4e44acaa2f338714a72db3"
          }
        },
        "69eec051a1764886bd80fb85d9afc869": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_99acdcd0832d41f38de3200dfb94d95f",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "",
            "max": 10000,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 10000,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_29284b0cc2e94c058ef9b894af3e8232"
          }
        },
        "c31ed6dcf1f345ba8884545962a07ec9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_96df455b6eec4550b6e905f433c83c69",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 10000/10000 [01:23&lt;00:00, 119.75 Docs/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_27c4d59ecce846feb592bc4f10ecdc7a"
          }
        },
        "27e530f9b7314479b59a4cf96a19a948": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "229af4dc1b4e44acaa2f338714a72db3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "99acdcd0832d41f38de3200dfb94d95f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "29284b0cc2e94c058ef9b894af3e8232": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "96df455b6eec4550b6e905f433c83c69": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "27c4d59ecce846feb592bc4f10ecdc7a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f4f95246aab9486197b32ebe4e23134b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_dac43d8a6dca4d7998e6b8a7786d9fb3",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_3419519ef95d48ccb1593b747e5c69af",
              "IPY_MODEL_7879d3794894430c8ebcedfd41d78ead",
              "IPY_MODEL_53459fe2ff9447b49cd20d792cde9430"
            ]
          }
        },
        "dac43d8a6dca4d7998e6b8a7786d9fb3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "3419519ef95d48ccb1593b747e5c69af": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_8e5470e49940433aa5215af177064a20",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Create embeddings: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b7e9710de3804072afea61278abccf1c"
          }
        },
        "7879d3794894430c8ebcedfd41d78ead": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_76bee247b0c44a12b3e7eeaaa574e1a7",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "",
            "max": 4528,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 4528,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_0a6b832a2dc84e6ea144282eeb0d1efd"
          }
        },
        "53459fe2ff9447b49cd20d792cde9430": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_512b5d6622714c6b987152a3e853775d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 4528/4528 [00:37&lt;00:00, 125.89 Docs/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_0a61f786b4764064b9d5d83d6966c8eb"
          }
        },
        "8e5470e49940433aa5215af177064a20": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b7e9710de3804072afea61278abccf1c": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "76bee247b0c44a12b3e7eeaaa574e1a7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "0a6b832a2dc84e6ea144282eeb0d1efd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "512b5d6622714c6b987152a3e853775d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "0a61f786b4764064b9d5d83d6966c8eb": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "cadfba2ad49c4491bebfeea46cdbfd04": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_0a7d88f48b844e57883a13932950e40a",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_d1010b43f7c7492fb43b27d42dc969b8",
              "IPY_MODEL_ecb0228894774a558a532d121c1ab5f1",
              "IPY_MODEL_06773afd7dec4dbf97bb3ed9a6625d90"
            ]
          }
        },
        "0a7d88f48b844e57883a13932950e40a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "d1010b43f7c7492fb43b27d42dc969b8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_37c0072536cb4b72b01c63f97af2ed6a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "answering questions...: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_ccebaabfa49542728426f5cc832b8f32"
          }
        },
        "ecb0228894774a558a532d121c1ab5f1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_5899fb0c71bb412b9c6e720f6025ed8d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 16,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 16,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_efc60ece34b64c7883fada81364b732b"
          }
        },
        "06773afd7dec4dbf97bb3ed9a6625d90": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_eabf44e30c384f0ba67d15daf9e6a058",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 16/16 [04:14&lt;00:00, 15.67s/it]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_070cf267930d4ad5b37ebedaedefa1fd"
          }
        },
        "37c0072536cb4b72b01c63f97af2ed6a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "ccebaabfa49542728426f5cc832b8f32": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5899fb0c71bb412b9c6e720f6025ed8d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "efc60ece34b64c7883fada81364b732b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "eabf44e30c384f0ba67d15daf9e6a058": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "070cf267930d4ad5b37ebedaedefa1fd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "c99dab409dc749d19586ba4ad916926c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_9ba51990f0fb4ef2810109bfb7b3be45",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_325282267da54dcbb427b075a9f26506",
              "IPY_MODEL_fbd0f39d95c949a7869931832f0bab3f",
              "IPY_MODEL_2580299a6032432ba4cdfc864dd4d4c3"
            ]
          }
        },
        "9ba51990f0fb4ef2810109bfb7b3be45": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "325282267da54dcbb427b075a9f26506": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_f43384a4565541738cfc1525420c6cfc",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_2bd3944dce784d4ca6aeead28c29735d"
          }
        },
        "fbd0f39d95c949a7869931832f0bab3f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_83334a864ba448049fc5db60a045cb32",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1291,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1291,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_11fcaee382324c56965ab0ff0e01e029"
          }
        },
        "2580299a6032432ba4cdfc864dd4d4c3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_3c8ca6b17c724ec3b011992a62950506",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1.29k/1.29k [00:00&lt;00:00, 43.3kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_40e53a738fef436a8bc6a50aed072c88"
          }
        },
        "f43384a4565541738cfc1525420c6cfc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "2bd3944dce784d4ca6aeead28c29735d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "83334a864ba448049fc5db60a045cb32": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "11fcaee382324c56965ab0ff0e01e029": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "3c8ca6b17c724ec3b011992a62950506": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "40e53a738fef436a8bc6a50aed072c88": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f20da18f698148ab882eb91acd94e674": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_23781c0c818341d9862bbca739e0331a",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_02b5d04ceee64b088100fc831341775d",
              "IPY_MODEL_a345cf60f8e54ef7a60f98a8a5b014e8",
              "IPY_MODEL_384a057a20eb49269a840d21cc14b283"
            ]
          }
        },
        "23781c0c818341d9862bbca739e0331a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "02b5d04ceee64b088100fc831341775d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_f0a759afe79245a295ae1fb606fd73cb",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_827936436d434bc9a7f530fcc7822248"
          }
        },
        "a345cf60f8e54ef7a60f98a8a5b014e8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_e34fbfe2349e42af9b8b3562df245387",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1839633783,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1839633783,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_3ceb4427b953473bb04daa6e937d27c8"
          }
        },
        "384a057a20eb49269a840d21cc14b283": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_0d814d6976144172a94bb2eef832f47b",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1.84G/1.84G [00:50&lt;00:00, 36.1MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_630ef2da281440e9b26e60ff5d2cc501"
          }
        },
        "f0a759afe79245a295ae1fb606fd73cb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "827936436d434bc9a7f530fcc7822248": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "e34fbfe2349e42af9b8b3562df245387": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "3ceb4427b953473bb04daa6e937d27c8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "0d814d6976144172a94bb2eef832f47b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "630ef2da281440e9b26e60ff5d2cc501": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "e0cab22ea8614f858bba3157f5284929": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_f3adb2e4e92941b5a63914f733de52c8",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_344b107cd6004274945f5e9fb2db8d37",
              "IPY_MODEL_9299178358ed4ae580fd75fe27e10eeb",
              "IPY_MODEL_5d073a3773e64b7b8153281784ab7d4a"
            ]
          }
        },
        "f3adb2e4e92941b5a63914f733de52c8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "344b107cd6004274945f5e9fb2db8d37": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_79ada5619ae549dc886074c5d0b079ba",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "getting defs for search_terms...: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_1d62d04c488a4aceb3597550ccd606fc"
          }
        },
        "9299178358ed4ae580fd75fe27e10eeb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_9e4c97eb18c44e7cad98f19be5722bf1",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 16,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 16,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_ed4425bde1b74eb2972526e0d00fdadf"
          }
        },
        "5d073a3773e64b7b8153281784ab7d4a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_59e7ff0a4102444c858235995750a252",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 16/16 [02:22&lt;00:00,  8.59s/it]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_4861e090d5124fe7a19e9d0ccd7e669e"
          }
        },
        "79ada5619ae549dc886074c5d0b079ba": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "1d62d04c488a4aceb3597550ccd606fc": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9e4c97eb18c44e7cad98f19be5722bf1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "ed4425bde1b74eb2972526e0d00fdadf": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "59e7ff0a4102444c858235995750a252": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "4861e090d5124fe7a19e9d0ccd7e669e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6d1cccc713dd438597db2af871b2f5ba": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_7f30902493d549ba8cb0bdbc3df13509",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_912bb412662b4c669aef4be773d94112",
              "IPY_MODEL_89eded059d964f0e998777a81acfc6c4",
              "IPY_MODEL_db3fb38ca1fd416f97f04cb06c50e62a"
            ]
          }
        },
        "7f30902493d549ba8cb0bdbc3df13509": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "912bb412662b4c669aef4be773d94112": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_18cad2a242a34155a4fc532f38d47e0d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "getting defs for search_terms...:   0%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_c0bf713a94ac4bd29e7eb882d7f84124"
          }
        },
        "89eded059d964f0e998777a81acfc6c4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_a6726f99abbb41d89536dfdc4a29e3df",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "",
            "max": 9,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 0,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_7a098fc570b246f7beaf773dfcd53eae"
          }
        },
        "db3fb38ca1fd416f97f04cb06c50e62a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_ad52d8079b294934a92e9001408de400",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 0/9 [00:00&lt;?, ?it/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_04c44f25fe814601804e5bfd0a3c144d"
          }
        },
        "18cad2a242a34155a4fc532f38d47e0d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "c0bf713a94ac4bd29e7eb882d7f84124": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "a6726f99abbb41d89536dfdc4a29e3df": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "7a098fc570b246f7beaf773dfcd53eae": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "ad52d8079b294934a92e9001408de400": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "04c44f25fe814601804e5bfd0a3c144d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/pszemraj/16f2ca3e0683877e6fc0b8cba3c613f4/haystack-course-info-extractor-iml-s21.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bEH-CRbeA6NU"
      },
      "source": [
        "# <center> Q&A with your Course Docs with the HayStack library </center>\n",
        "--- \n",
        "- long form question answering (generative LFQA) from a corpus of text documents. Think of it as an *Auto-TA*. \n",
        "    - text docs from **IML 2021** are included in this example. \n",
        "    - for a demo, just run all cells in this notebook. \n",
        "- the most relevant sections are at the bottom: **Question-Answer Pipeline & Summarized Document Search** \n",
        "- instructions are included throughout notebook \n",
        "    - adjust hyperparameters tagged with `decrease_if_crash` if the notebook crashes \n",
        "--- \n",
        "![kmeans](https://www.dropbox.com/s/9afgpp0kt4zwh01/kmeans%20def.png?dl=1)\n",
        "\n",
        "---"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "1UZobc1vJ_Vt"
      },
      "source": [
        "course_name = \"intro to ML\" #@param {type:\"string\"}\n"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uPQ6IBiDeT_i"
      },
      "source": [
        "## Purpose\n",
        "\n",
        "uses *Haystack* to perform analysis on a corpus of documents all from one class.\n",
        "\n",
        "#### References\n",
        "- A [tutorial](https://towardsdatascience.com/ask-wikipedia-eli5-like-questions-using-long-form-question-answering-on-haystack-32cf1ca6c00e) from medium\n",
        "- [haystack github](https://github.com/deepset-ai/haystack)\n",
        "- [haystack api ref](https://haystack.deepset.ai/docs/latest/apiretrievermd)\n",
        "\n",
        "---\n",
        "\n",
        "### CUDA crash / troubleshooting \n",
        "- <font color=\"salmon\"> info retrieval memory usage varies widely depending on the number of documents and associated hyperparameters for the models (there are two) and how the given models interact with each other (somewhat unpredictable)\n",
        "    - on top of that, the GPU Colab assigns the runtime is pseudo-random and obviously memory depends on the hardware\n",
        "    - if you run `!nvidia-smi` (see below) and your assigned GPU is not 16 gb, you almost definitely will need to change some things\n",
        "- running into a couple CUDA issues while figuring out what works is normal. If it crashes, decrease some key hyperparameters or change the model type used, and try again\n",
        "- hyperparameters that are recommended to be adjusted first have been tagged with the comment `decrease_if_crash` in this document. You can search for this tag with the \"find\" functionality (I think the standard shortcut is `CTRL+H` in Colab)\n",
        "    - typically the very first thing to change should be `n_beams` or `number_beam_search`\n",
        "- if I need to take a look at it and the above things have obviously not been investigated.. 😤</font>\n",
        "\n",
        "---\n",
        "\n",
        "### Adapting this notebook\n",
        "\n",
        "This concept works for other classes, or just _fields of information that have documents_ in general. Here's how:\n",
        "\n",
        "1. main pre-requisite is that you have all relevant documents as .txt zipped together.\n",
        "    - convert audio to text using [vid2cleantxt](https://github.com/pszemraj/vid2cleantxt) or other, PDFs/images can be converted via [OCR](https://github.com/JaidedAI/EasyOCR)\n",
        "2. the library has not been tested with PDF (by me, at least) but in theory should work - review haystack API docs *linked above* for what they have on PDF\n",
        "3. upload your document online somewhere that the .zip can be retried by the `requests` library and then update `URL_to_archive`\n",
        "4. update questions as needed at the bottom of this notebook\n",
        "5. update the models as needed once more comfortable with haystack API (for example, can use text gen trained on ELI5)\n",
        "\n",
        "\n",
        "*by [Peter Szemraj](https://github.com/pszemraj)*\n",
        "\n",
        "---"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JlZgP8q1A6NW",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "0c853a09-7ec5-4a98-e4c8-a63a388e08d1"
      },
      "source": [
        "# Make sure you have a GPU running that is 12 gb or greater\n",
        "!nvidia-smi\n",
        "# Runtime -> Change Runtime Type -> GPU + High-Ram if it lets you"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Thu Aug 19 22:43:30 2021       \n",
            "+-----------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
            "|-------------------------------+----------------------+----------------------+\n",
            "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
            "|                               |                      |               MIG M. |\n",
            "|===============================+======================+======================|\n",
            "|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |\n",
            "| N/A   58C    P0    40W / 250W |      0MiB / 16280MiB |      0%      Default |\n",
            "|                               |                      |                  N/A |\n",
            "+-------------------------------+----------------------+----------------------+\n",
            "                                                                               \n",
            "+-----------------------------------------------------------------------------+\n",
            "| Processes:                                                                  |\n",
            "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
            "|        ID   ID                                                   Usage      |\n",
            "|=============================================================================|\n",
            "|  No running processes found                                                 |\n",
            "+-----------------------------------------------------------------------------+\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wIVP-NAqNsn7"
      },
      "source": [
        "# setup"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RnuIUHhCPulF"
      },
      "source": [
        "from IPython.display import HTML, display\n",
        "\n",
        "def set_css():\n",
        "  display(HTML('''\n",
        "  <style>\n",
        "    pre {\n",
        "        white-space: pre-wrap;\n",
        "    }\n",
        "  </style>\n",
        "  '''))\n",
        "get_ipython().events.register('pre_run_cell', set_css)"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kkPbArxONrQu",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "9297c1dc-f3f8-46b4-8e3c-004a13575a58"
      },
      "source": [
        "%%capture\n",
        "# this apparently makes the docstore faster\n",
        "!git clone https://github.com/NVIDIA/apex\n",
        "!cd apex\n",
        "!pip install -v --disable-pip-version-check --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" ./"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Hly5Ul1PR953"
      },
      "source": [
        "<font color=\"salmon\"> *NOTE if Colab crashes while installing packages restart + run all cells (factory reset **not** required)* </font>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NM36kbRFA6Nc",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "f0e352af-b5b8-49c2-fb04-9af31275ccbd"
      },
      "source": [
        "%%capture\n",
        "# Install the latest release of Haystack in your own environment \n",
        "#! pip install farm-haystack\n",
        "\n",
        "# Install the latest master of Haystack\n",
        "!pip install grpcio-tools==1.34.1\n",
        "!pip install git+https://github.com/deepset-ai/haystack.git\n",
        "!pip install -U tqdm\n",
        "!pip install -U dropbox\n",
        "!pip install -U unidecode\n",
        "!pip install -U clean-text\n",
        "!pip install -U wordninja\n",
        "!pip install 'ray[default]' # because of warning during imports\n",
        "\n",
        "import wordninja\n",
        "from cleantext import clean\n",
        "from tqdm.auto import tqdm\n",
        "import dropbox\n",
        "import joblib"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xmRuhTQ7A6Nh",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "6c45ce2c-12f7-4535-8098-eeddc24b2126"
      },
      "source": [
        " import gc\n",
        "import pprint as pp\n",
        "from datetime import datetime\n",
        "\n",
        "# if it crashes here just restart -> run all\n",
        "from haystack.preprocessor.cleaning import clean_wiki_text\n",
        "from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
        "from haystack.generator.transformers import Seq2SeqGenerator\n",
        "from haystack.preprocessor.preprocessor import PreProcessor\n",
        "from haystack.document_store.faiss import FAISSDocumentStore\n",
        "from haystack.generator.transformers import RAGenerator\n",
        "from haystack.retriever.dense import DensePassageRetriever\n",
        "import sentencepiece\n",
        "import transformers"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "d3yqOW7zbKHJ"
      },
      "source": [
        "## misc utility functions"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Pq2mysRObLp1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "4f2d91b9-1f56-42e6-ae3d-733623199357"
      },
      "source": [
        "from IPython.display import clear_output\n",
        "# https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html\n",
        "def isnotebook():\n",
        "    try:\n",
        "        shell = get_ipython().__class__.__name__\n",
        "        if shell == 'ZMQInteractiveShell':\n",
        "            return True   # Jupyter notebook or qtconsole\n",
        "        elif shell == 'Shell':\n",
        "            return True  # Colab\n",
        "        elif shell == 'TerminalInteractiveShell':\n",
        "            return False  # Terminal running IPython\n",
        "        else:\n",
        "            return False  # Other type (?)\n",
        "    except NameError:\n",
        "        return False      # Probably standard Python interpreter\n",
        "\n",
        "def clear_jupyter_cell():\n",
        "    is_jupyter = isnotebook()\n",
        "\n",
        "    if is_jupyter:\n",
        "        clear_output(wait=False)\n",
        "    else:\n",
        "        print(\"not in a jupyter notebook\")"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZKFlJq0qfp5b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "5ac0937f-bce8-489c-c368-2df98802fb15"
      },
      "source": [
        "# basic SC\n",
        "import re\n",
        "def remove_string_extras(mytext):\n",
        "    # removes everything from a string except A-Za-z0-9 .,;\n",
        "    return re.sub(r'[^A-Za-z0-9 _.,;]+', '', mytext)\n",
        "\n",
        "def corr(s):\n",
        "    # adds space after period if there isn't one\n",
        "    # removes extra spaces\n",
        "    return re.sub(r'\\.(?! )', '. ', re.sub(r' +', ' ', s))"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ipq9Dq1wRbrH",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "e42b86dc-7cf5-4957-8942-81d9d008b48c"
      },
      "source": [
        "def clean_output(ugly_text, txt_lan=\"en\"):\n",
        "    # a wrapper for clean text with options different than default \n",
        "\n",
        "    # https://pypi.org/project/clean-text/\n",
        "    cleaned_text = clean(ugly_text,\n",
        "                        fix_unicode=True,               # fix various unicode errors\n",
        "                        to_ascii=True,                  # transliterate to closest ASCII representation\n",
        "                        lower=False,                     # lowercase text\n",
        "                        no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them\n",
        "                        no_urls=True,                  # replace all URLs with a special token\n",
        "                        no_emails=True,                # replace all email addresses with a special token\n",
        "                        no_phone_numbers=True,         # replace all phone numbers with a special token\n",
        "                        no_numbers=False,               # replace all numbers with a special token\n",
        "                        no_digits=False,                # replace all digits with a special token\n",
        "                        no_currency_symbols=True,      # replace all currency symbols with a special token\n",
        "                        no_punct=False,                 # remove punctuations\n",
        "                        replace_with_punct=\"\",          # instead of removing punctuations you may replace them\n",
        "                        replace_with_url=\"<URL>\",\n",
        "                        replace_with_email=\"<EMAIL>\",\n",
        "                        replace_with_phone_number=\"<PHONE>\",\n",
        "                        replace_with_number=\"<NUM>\",\n",
        "                        replace_with_digit=\"0\",\n",
        "                        replace_with_currency_symbol=\"<CUR>\",\n",
        "                        lang=txt_lan                       # set to 'de' for German special handling\n",
        "                        )\n",
        "\n",
        "    return cleaned_text"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jBg44J-cedxd",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "dd67c3ef-7d77-484d-faf7-4c5112c79cea"
      },
      "source": [
        "def beautify_filename(filename, num_words=20, start_reverse=False, \n",
        "                      word_separator=\"_\"):\n",
        "    # takes a filename stored as text, removes extension, separates into X words ...\n",
        "    # and returns a nice filename with the words separateed by \n",
        "    # useful for when you are reading files, doing things to them, and making new files \n",
        "    \n",
        "    filename = str(filename)\n",
        "    index_file_Ext = filename.rfind('.')\n",
        "    current_name = str(filename)[:index_file_Ext]  # get rid of extension\n",
        "    if current_name[-1].isnumeric():\n",
        "        current_name = current_name + \"s\"\n",
        "    clean_name = clean_output(current_name)\n",
        "    file_words = wordninja.split(clean_name)\n",
        "    # splits concatenated text into a list of words based on common word freq\n",
        "    if len(file_words) <= num_words:\n",
        "        num_words = len(file_words)\n",
        "\n",
        "    if start_reverse:\n",
        "        t_file_words = file_words[-num_words:]\n",
        "    else:\n",
        "        t_file_words = file_words[:num_words]\n",
        "\n",
        "    pretty_name = word_separator.join(t_file_words) # see function argument\n",
        "\n",
        "    # NOTE IT DOES NOT RETURN THE EXTENSION\n",
        "    return pretty_name[: (len(pretty_name) - 1)]  # there is a space always at the end, so -1\n"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "y_fvs3GoRnr8"
      },
      "source": [
        "download & extract zipped folder"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "k9j223dKRif-",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "6c7220e0-c5ae-4a89-ee2f-32ba5f61cecf"
      },
      "source": [
        "import re, os, shutil\n",
        "def URL_string_filter(text):\n",
        "    custom_printable = \"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._\"\n",
        "\n",
        "    filtered = ''.join((filter(lambda i: i in custom_printable, text)))\n",
        "\n",
        "    return filtered \n",
        "\n",
        "def getFilename_fromCd(cd):\n",
        "\n",
        "    if not cd:\n",
        "        return None\n",
        "    fname = re.findall('filename=(.+)', cd)\n",
        "    if len(fname) > 0:\n",
        "        output = fname[0]\n",
        "    elif cd.find('/'):\n",
        "        possible_fname = url.rsplit('/', 1)[1]\n",
        "        output = URL_string_filter(possible_fname)\n",
        "    else:\n",
        "        output = None\n",
        "    return output"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FxQ5vzywRigD",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "cc4dc9dd-e507-4abe-e8da-83ba7f8e1579"
      },
      "source": [
        "import shutil, lzma, bz2, zlib # zipfile formats\n",
        "import requests\n",
        "from os.path import getsize, join, isdir\n",
        "from datetime import datetime\n",
        "\n",
        "def get_zip_URL(URLtoget, extract_loc=None,\n",
        "                file_header=\"zipexport_\", verbose=False):\n",
        "    \n",
        "    r = requests.get(URLtoget, allow_redirects=True)\n",
        "    names = getFilename_fromCd(r.headers.get('content-disposition'))\n",
        "    try:\n",
        "        fixed_fnames = names.split(\";\") # split the multiple results\n",
        "        this_filename = file_header + URL_string_filter(fixed_fnames[0])\n",
        "    except:\n",
        "        this_filename = file_header + \".tar.gz\"\n",
        "        print(\"has no filename, using default of {}\".format(this_filename))\n",
        "\n",
        "    # define paths and save the zip file\n",
        "    if extract_loc is None:\n",
        "        extract_loc = \"zip_download\"\n",
        "    dl_place = join(os.getcwd(), extract_loc)\n",
        "    os.makedirs(dl_place, exist_ok=True)\n",
        "    save_loc = join(os.getcwd(), this_filename)\n",
        "    open(save_loc, 'wb').write(r.content)\n",
        "    if verbose: print(\"downloaded file size was {} MB\".format(getsize(save_loc)/1000000))\n",
        "\n",
        "    # unpack the archive\n",
        "    shutil.unpack_archive(save_loc, extract_dir=dl_place)\n",
        "    if verbose: \n",
        "        print(\"extracted zip file - \", datetime.now())\n",
        "        filelist = []\n",
        "\n",
        "        for root, dirs, files in os.walk(dl_place):\n",
        "            for file in files:\n",
        "                #append the file name to the list\n",
        "                filelist.append(os.path.join(root,file))\n",
        "\n",
        "        print(\"a total of {} files in {}\".format(len(filelist), dl_place))\n",
        "\n",
        "    # remove original\n",
        "    try:\n",
        "        os.remove(save_loc)\n",
        "        del save_loc\n",
        "    except:\n",
        "        print(\"unable to delete original zipfile - check if exists\",\n",
        "              datetime.now())\n",
        "\n",
        "    print(\"finished extracting link - \", datetime.now())\n",
        "\n",
        "    return dl_place"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "u5lpLeEO-pVc"
      },
      "source": [
        "download a file from colab"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bbmuHa3c-rQh",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "579b2fe6-8568-4511-84da-4371b879dba3"
      },
      "source": [
        "from google.colab import files\n",
        "from os.path import basename \n",
        "\n",
        "def download_file(my_path, verbose=False):\n",
        "\n",
        "    files.download(my_path)\n",
        "    if verbose: print(\"initiated download of {} - \".format(basename(my_path)),\n",
        "                      datetime.now())"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TZ7ZrQfrJmQQ",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "e2328506-6b93-4b78-b670-d6dd74dc8af2"
      },
      "source": [
        "def get_size_mb(path2file, verbose=False):\n",
        "\n",
        "    file_stats = os.stat(path2file)\n",
        "\n",
        "    file_size_mb = {file_stats.st_size / (1024 * 1024)}\n",
        "    if verbose: print(f'File Size in MegaBytes is {file_size_mb}')\n",
        "    return round(list(file_size_mb)[0],2) # returns rounded to 2 decimals"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hcu7JY0fNy81"
      },
      "source": [
        "# process documents"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q3dSo7ZtA6Nl"
      },
      "source": [
        "### Document Store\n",
        "\n",
        "> FAISS is a library for efficient similarity search on a cluster of dense vectors.\n",
        "The `FAISSDocumentStore` uses a SQL(SQLite in-memory be default) database under-the-hood\n",
        "to store the document text and other meta data. The vector embeddings of the text are\n",
        "indexed on a FAISS Index that later is queried for searching answers.\n",
        "The default flavour of FAISSDocumentStore is \"Flat\" but can also be set to \"HNSW\" for\n",
        "faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor.\n",
        "For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n",
        "\n",
        "- [link](https://haystack.deepset.ai/docs/latest/apidatabasemd#Module-faiss) to haystack API on doc store\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "pycharm": {
          "name": "#%% md\n"
        },
        "id": "ggPXF9rWhW2h"
      },
      "source": [
        "### Cleaning & indexing documents\n",
        "\n",
        "download, convert and index to DocumentStore"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WMHL9-v7TYHn",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "cellView": "form",
        "outputId": "3513ca9e-57eb-420b-8adf-e03139415302"
      },
      "source": [
        "URL_to_archive = \"https://www.dropbox.com/sh/65nm4ks3m1wlvgn/AACz3XyHxEZc1HaPTYaXAo0va?dl=1\" #@param {type:\"string\"}\n",
        "use_elasticsearch = True #@param {type:\"boolean\"}\n"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0sP88vM04-7x",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 103
        },
        "outputId": "644c778c-b88e-44b4-a4fb-7dfdc0555dbd"
      },
      "source": [
        "%%time\n",
        "if use_elasticsearch:\n",
        "    from haystack.utils import launch_es\n",
        "    launch_es()\n",
        "\n",
        "    # In Colab / No Docker environments: Start Elasticsearch from source\n",
        "    ! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
        "    ! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
        "    ! chown -R daemon:daemon elasticsearch-7.9.2\n",
        "\n",
        "    import os\n",
        "    from subprocess import Popen, PIPE, STDOUT\n",
        "    es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
        "                    stdout=PIPE, stderr=STDOUT,\n",
        "                    preexec_fn=lambda: os.setuid(1)  # as daemon\n",
        "                    )\n",
        "    # wait until ES has started\n",
        "    ! sleep 30"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 22:44:12 - INFO - haystack.utils -   Starting Elasticsearch ...\n",
            "08/19/2021 22:44:12 - WARNING - haystack.utils -   Tried to start Elasticsearch through Docker but this failed. It is likely that there is already an existing Elasticsearch instance running. \n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "CPU times: user 375 ms, sys: 154 ms, total: 529 ms\n",
            "Wall time: 59.7 s\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6L_gvC6r5WkM",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 103
        },
        "outputId": "239a6462-3613-4230-dda0-a5b0d4f093d9"
      },
      "source": [
        "if use_elasticsearch:\n",
        "    from haystack.document_store.elasticsearch import ElasticsearchDocumentStore\n",
        "    # use local instance until decide on online provider\n",
        "    document_store = ElasticsearchDocumentStore(host=\"localhost\", \n",
        "                                                username=\"\", password=\"\", \n",
        "                                                index=\"document\",\n",
        "                                                return_embedding=True,\n",
        "                                                duplicate_documents='skip',\n",
        "                                                similarity=\"cosine\",\n",
        "                                                )\n"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 22:45:12 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.076s]\n",
            "08/19/2021 22:45:12 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.010s]\n",
            "08/19/2021 22:45:12 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.005s]\n",
            "08/19/2021 22:45:12 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.029s]\n",
            "08/19/2021 22:45:12 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.003s]\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1cYgDJmrA6Nv",
        "pycharm": {
          "name": "#%%\n"
        },
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "332d8103-e4e3-4852-b458-221fcefd3f9d"
      },
      "source": [
        "from haystack.document_store.faiss import FAISSDocumentStore\n",
        "\n",
        "\n",
        "if not use_elasticsearch:\n",
        "\n",
        "    document_store = FAISSDocumentStore(\n",
        "        faiss_index_factory_str=\"Flat\",\n",
        "        return_embedding=True,\n",
        "        # similarity=\"cosine\",\n",
        "        progress_bar=True,\n",
        "        duplicate_documents='skip',\n",
        "\n",
        "    )"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YK3G4UV3RhpG"
      },
      "source": [
        "The hyperparameter `words_per_doc` *drastically* changes performance and HayStack recommends leaving it at 100. Would only make sense changing if the relevant results of the intended queries are super long **and** the amount of documents itself is relatively small"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "Oc86ThopfGJD",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "294ff3d4-cf8a-4560-dcb6-cdd776f341ff"
      },
      "source": [
        "words_per_doc =  100#@param {type:\"integer\"}\n"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        },
        "id": "1I0c27eXhW2i"
      },
      "source": [
        "%%capture\n",
        "# custom dataset loaded above\n",
        "path_to_dataset = get_zip_URL(URL_to_archive, extract_loc=\"text_corpus\",\n",
        "                              file_header=\"source_text\", verbose=True)\n",
        "\n",
        "# Convert files to dicts\n",
        "dicts = convert_files_to_dicts(dir_path=path_to_dataset,\n",
        "                               clean_func=clean_wiki_text, \n",
        "                               split_paragraphs=True,)\n",
        "preprocessor = PreProcessor(\n",
        "    clean_empty_lines=True,\n",
        "    clean_whitespace=True,\n",
        "    clean_header_footer=True,\n",
        "    split_by=\"word\",\n",
        "    split_length=words_per_doc,\n",
        "    split_respect_sentence_boundary=True\n",
        ")\n",
        "nested_docs = [preprocessor.process(d) for d in dicts]\n",
        "docs = [d for x in nested_docs for d in x]\n",
        "\n",
        "# this part will write all the docs in the docstore to the local database\n",
        "document_store.delete_documents()\n",
        "\n",
        "document_store.write_documents(docs, duplicate_documents='skip', \n",
        "                               batch_size=30000)\n",
        "# it will print a bunch of things\n",
        "clear_jupyter_cell()\n",
        "print(\"documents written - \", datetime.now())"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VouRJydoPysN"
      },
      "source": [
        "# Info Retrieval Architecture"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qeFya9Wl-8gj"
      },
      "source": [
        "## Retriever\n",
        "\n",
        "- tutorial originally uses a `RetribertRetriever` and  invokes `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`:\n",
        "\n",
        "```\n",
        "retriever = EmbeddingRetriever(document_store=document_store,\n",
        "                               embedding_model=\"yjernite/retribert-base-uncased\",\n",
        "                               model_format=\"retribert\")\n",
        "```\n",
        "\n",
        "- based on some results from the other tutorials, `DensePassageRetriever` is better but slower. it is used here."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kFwiPP60A6N7",
        "pycharm": {
          "is_executing": true
        },
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 186,
          "referenced_widgets": [
            "90d338fc12f24a0f9f2c2cb4fc07a555",
            "768cd79789a44724872bb6f315f01667",
            "fa5ecb3f7c244f32ae847714cb212923",
            "9cf3d45a7395476685748dc565f42dfe",
            "eb7b79ea89f64e2c890d3e0dcb60e0de",
            "81097802618746b399b2788a14635853",
            "d90f3d0362c54b61a76919bd2c85bc2d",
            "4386721c9689477cb52f10f82e3efb97",
            "99d8e8d3231b4e99a983b8bf7bc7a5e2",
            "6eb390f7983b4c9384156d2a48d2f3ac",
            "b463b88099b145d9b8c90c0c361411f2",
            "49d9beb622bc4083bda44d153cde09d5",
            "9bc7d217fd494f3babd6b1f559dcd0e0",
            "64c09b2062be42daab3e4642f5377ac2",
            "0f8c70e244db410f926a06ef57322375",
            "4e7f0a5a083f4ac78ece5e310b0fb98f",
            "848c259c189d4308a23b3de278c544e3",
            "14b280c4f7f142f0976148b8f95d549b",
            "7694f4df92b2440a99acce949ce24d49",
            "911f175505c14d3d9d0cd82285f68d37",
            "8f10292e23f049b68fe51a8d4b343e67",
            "2fa250bbb6f5473c91570ca5ce4ab553",
            "a1c248f9f3854ab996d683acd10db563",
            "c01ef5167c704b719cdcbc9455265f20",
            "d7ba15dedcbb475484fee6814351bff9",
            "71a1bbc7dad44bc8a296127e0da6ea64",
            "108c208362a44b35a78d89f41a9f2649",
            "4bf352be47dd45b2ad0465202950c009",
            "64762b258c10418d8665a047481890a0",
            "5a8089e9b8d8447f8d6a72fe46b90d22",
            "82d26a176bcb4719b89b39ceb6040b3e",
            "4d0ee98d72984214b05e0e9c47865ce6",
            "bd91aaafdfde46b5a7357ab5b206a6e2",
            "9438931fa26c4e558d9149a468832438",
            "a3907edd4a7a49edb7cda384b82d94c1",
            "8c81af657f384ec5af715b3c0af2c16b",
            "69eec051a1764886bd80fb85d9afc869",
            "c31ed6dcf1f345ba8884545962a07ec9",
            "27e530f9b7314479b59a4cf96a19a948",
            "229af4dc1b4e44acaa2f338714a72db3",
            "99acdcd0832d41f38de3200dfb94d95f",
            "29284b0cc2e94c058ef9b894af3e8232",
            "96df455b6eec4550b6e905f433c83c69",
            "27c4d59ecce846feb592bc4f10ecdc7a",
            "f4f95246aab9486197b32ebe4e23134b",
            "dac43d8a6dca4d7998e6b8a7786d9fb3",
            "3419519ef95d48ccb1593b747e5c69af",
            "7879d3794894430c8ebcedfd41d78ead",
            "53459fe2ff9447b49cd20d792cde9430",
            "8e5470e49940433aa5215af177064a20",
            "b7e9710de3804072afea61278abccf1c",
            "76bee247b0c44a12b3e7eeaaa574e1a7",
            "0a6b832a2dc84e6ea144282eeb0d1efd",
            "512b5d6622714c6b987152a3e853775d",
            "0a61f786b4764064b9d5d83d6966c8eb"
          ]
        },
        "outputId": "95856356-1d17-4639-9c7b-df50d4a5fe0a"
      },
      "source": [
        "%%time\n",
        "from haystack.retriever.dense import EmbeddingRetriever\n",
        "\n",
        "# dense                            \n",
        "retriever = DensePassageRetriever(\n",
        "    document_store=document_store,\n",
        "    query_embedding_model=\"facebook/dpr-question_encoder-single-nq-base\",\n",
        "    passage_embedding_model=\"facebook/dpr-ctx_encoder-single-nq-base\",\n",
        "    use_gpu=True,\n",
        "    embed_title=True,\n",
        "    max_seq_len_passage = 256,\n",
        "    max_seq_len_query = 128,\n",
        "    top_k=50, # tested up to 50\n",
        "    use_fast_tokenizers=False, \n",
        "    similarity_function=\"cosine\", # 'dot_product' or 'cosine'\n",
        "    progress_bar=True,\n",
        "    batch_size=16 # works partially on 32\n",
        "\n",
        ")\n",
        "\n",
        "# this part is where the text embeddings are generated / updated. \n",
        "# this will take the longest amount of time (10-20 minutes)\n",
        "document_store.update_embeddings(retriever, update_existing_embeddings=False)\n",
        "                                 \n",
        "clear_jupyter_cell()\n",
        "print(\"Finished! - \", datetime.now(), \"\\n\\n\")"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Finished! -  2021-08-19 22:56:23.987868 \n",
            "\n",
            "\n",
            "CPU times: user 7min 4s, sys: 5.34 s, total: 7min 9s\n",
            "Wall time: 9min 15s\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sMlVEnJ2NkZZ"
      },
      "source": [
        "#### validate retriever\n",
        "\n",
        "- Before blindly using the `Retriever` part of the pipeline,empirically test it to make sure a simple search indeed finds the relevant documents.\n",
        "- the documents printed out (and text) should be relevant to the query. If not, the questions part won't work"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "gpKCew7WU8kr",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "9130247f-765d-4009-f579-e3c63809c95b"
      },
      "source": [
        "test_query = \"dimensionality reduction\" #@param {type:\"string\"}\n"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qpu-t9rndgpe",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 531
        },
        "outputId": "9148f2f4-6e54-42d8-fe12-69b8d99f306f"
      },
      "source": [
        "from haystack.utils import print_answers, print_documents\n",
        "from haystack.pipeline import DocumentSearchPipeline\n",
        "\n",
        "p_retrieval = DocumentSearchPipeline(retriever)\n",
        "res = p_retrieval.run(\n",
        "    query=test_query,\n",
        "    top_k_retriever=5\n",
        ")\n",
        "print_documents(res, max_text_len=256)\n"
      ],
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "Query: dimensionality reduction\n",
            "\n",
            "{   'name': 'mlpp_3528_deep_learnin.txt',\n",
            "    'text': 'Each hidden unit is obtained by convolving with the appropriate '\n",
            "            'filter; and then summing over the input planes  The final layer '\n",
            "            'is obtained by computing the local maximum within a small window. '\n",
            "            'Source:   Figure 1 of (Chen et al. 2010) Used with kind permis...'}\n",
            "\n",
            "{   'name': 'mlpp_2922_more_variation_al_inferenc.txt',\n",
            "    'text': '2011, Theorem 14). (The troublesome cases  arise when there are   '\n",
            "            'fractional assignments  with the same optimal value as the MAP '\n",
            "            'estimate.)'}\n",
            "\n",
            "{   'name': 'ISL2_53_linear_regressio.txt',\n",
            "    'text': 'The blue lines track the outer quantiles of the residuals, and '\n",
            "            'emphasize patterns. Left: The funnel shape indicates '\n",
            "            'heteroscedasticity: Right: The response has been log transformed, '\n",
            "            'and there is now no evidence of heteroscedasticity: terms can '\n",
            "            'also occur o...'}\n",
            "\n",
            "{   'name': 'mlpp_81_introductio.txt',\n",
            "    'text': 'The poor performance in high dimensional settings is due to the '\n",
            "            'curse of dimensionality: To explain the curse, we give some '\n",
            "            'examples from (Hastie et al. 2009, p22).'}\n",
            "\n",
            "{   'name': 'ESL__42_overview_of_supervised_learnin.txt',\n",
            "    'text': 'It would seem that with a reasonably large set of training data, '\n",
            "            'we could always approximate the theoretically optimal conditional '\n",
            "            'expectation by k-nearest-neighbor averaging; since we should be '\n",
            "            'able to find a fairly large neighborhood of observations clos...'}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rnVR28OXA6OA"
      },
      "source": [
        "## Generator\n",
        "\n",
        "now initalize the reader/generator.\n",
        "\n",
        "The default is a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see definition [here](https://huggingface.co/yjernite/bart_eli5))\n",
        "\n",
        "- usable models for generating answers are listed [here](https://huggingface.co/models?pipeline_tag=text2text-generation)\n",
        "    - *NOTE all custom generators need to be \"text 2 text\"*\n",
        "- [original implementation](https://yjernite.github.io/lfqa.html) of the bart Eli5 model\n",
        "\n",
        "\n",
        "\n",
        "---\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Wmd3HbeHd1JP"
      },
      "source": [
        "#### Custom Converter - How-To\n",
        "\n",
        "Need to pass in a pre-defined class object to Seq2SeqGenerator as a `input_converter`:\n",
        "definition:\n",
        "```\n",
        "input_converter: an optional Callable to prepare model input for the underlying language model specified in model_name_or_path parameter.\n",
        " The required call method signature for the Callable is: \n",
        " call(tokenizer: PreTrainedTokenizer, query: str, documents: List[Document], top_k: Optional[int] = None) -> BatchEncoding:\n",
        "```\n",
        "\n",
        "- [source](https://github.com/deepset-ai/haystack/pull/1086)\n",
        "\n",
        "**links for custom converter help**\n",
        "\n",
        "1. transformers [generation mixin docs](https://huggingface.co/transformers/main_classes/model.html?transformers.generation_utils.GenerationMixin#transformers.generation_utils.GenerationMixin)\n",
        "2. Blog post [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)\n",
        "3. check the github of haystack and search for `Seq2SeqGenerator` definition to find where the original class is defined (there is one for the default model)\n",
        "    - a [current link](https://github.com/deepset-ai/haystack/blob/17dcb8c23e2e79391965f84c80eff58522c65c52/haystack/generator/transformers.py) to the file. in `haystack/generator/transformers.py`\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yzj-ZAfOQtVa"
      },
      "source": [
        "> <font color=\"salmon\"> in this next section you define classes for the converter. basically check [huggingface docs](https://huggingface.co/transformers/model_doc/t5.html) for what the tokenizer wants, convert the input documents etc to that. </font>\n",
        "\n",
        "> ```Model type should be one of BigBirdPegasusConfig, M2M100Config, LEDConfig, BlenderbotSmallConfig, MT5Config, T5Config, PegasusConfig, MarianConfig, MBartConfig, BlenderbotConfig, BartConfig, FSMTConfig, EncoderDecoderConfig, XLMProphetNetConfig, ProphetNetConfig.```"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "toVrGrd-Qrgw",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "271a50c8-d5ac-437f-fe2b-a48e9bde84b8"
      },
      "source": [
        "from transformers import RagTokenizer, RagTokenForGeneration, AutoTokenizer, \\\n",
        "    AutoModelForSeq2SeqLM, PreTrainedTokenizer, BatchEncoding\n",
        "from haystack import Document\n",
        "from typing import Any, Dict, List, Optional\n",
        "\n",
        "class _PegParaPhraseConv:\n",
        "    \"\"\"\n",
        "       A sequence-to-sequence model input converter (https://huggingface.co/yjernite/bart_eli5) based on the\n",
        "       BART architecture fine-tuned on ELI5 dataset (https://arxiv.org/abs/1907.09190).\n",
        "       \n",
        "       The converter takes documents and a query as input and formats them into a single sequence\n",
        "       that a seq2seq model can use it as input for its generation step. \n",
        "       This includes model-specific prefixes, separation tokens and the actual conversion into tensors.\n",
        "       \n",
        "       For more details refer to Yacine Jernite's excellent LFQA contributions at https://yjernite.github.io/lfqa.html\n",
        "    \"\"\"\n",
        "\n",
        "    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],\n",
        "                 top_k: Optional[int] = None) -> BatchEncoding:\n",
        "        # if there are specific required here add them (model dependent)\n",
        "        conditioned_doc = \" \".join([d.text for d in documents])\n",
        "\n",
        "        # concatenate question and support document into BART input\n",
        "        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
        "\n",
        "        return tokenizer([query_and_docs],truncation=True, padding='longest',\n",
        "                          max_length=512, return_tensors=\"pt\")\n",
        "\n",
        "class _T5asSummary:\n",
        "    # use case: t5 one line summary\n",
        "    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],\n",
        "                 top_k: Optional[int] = None) -> BatchEncoding:\n",
        "        # if there are specific required here add them (model dependent)\n",
        "        conditioned_doc = \" \".join([d.text for d in documents])\n",
        "\n",
        "        # concatenate question and support document into BART input\n",
        "        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
        "\n",
        "        return tokenizer(\"summarize: \" + query_and_docs, truncation=True, \n",
        "                         padding='longest', return_tensors=\"pt\")\n",
        "        \n",
        "\n",
        "# https://huggingface.co/transformers/model_doc/gpt_neo.html\n",
        "class _T5asQA:\n",
        "    # use case - all other instances of T5 in the generator\n",
        "    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],\n",
        "                 top_k: Optional[int] = None) -> BatchEncoding:\n",
        "        # if there are specific required here add them (model dependent)\n",
        "        conditioned_doc = \" \".join([d.text for d in documents])\n",
        "\n",
        "        # concatenate question and support document into BART input\n",
        "        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
        "\n",
        "        return tokenizer(query_and_docs, truncation=True, \n",
        "                         padding='longest', return_tensors=\"pt\")\n",
        "\n",
        "class _LEDasQA:\n",
        "    ## for allen ai longformer\n",
        "    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],\n",
        "                top_k: Optional[int] = None) -> BatchEncoding:\n",
        "        # if there are specific required here add them (model dependent)\n",
        "        conditioned_doc = \" \".join([d.text for d in documents])\n",
        "\n",
        "        # concatenate question and support document into model\n",
        "        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
        "\n",
        "        return tokenizer(query_and_docs, truncation=True, \n",
        "                            return_tensors=\"pt\")\n",
        "        \n",
        "class _BigBirdforQA:\n",
        "    # for google's pegasus-bigbird\n",
        "    def __call__(self, tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],\n",
        "                top_k: Optional[int] = None) -> BatchEncoding:\n",
        "        # if there are specific required here add them (model dependent)\n",
        "        conditioned_doc = \" \".join([d.text for d in documents])\n",
        "\n",
        "        # concatenate question and support document into model\n",
        "        query_and_docs = \"question: {} context: {}\".format(query, conditioned_doc)\n",
        "\n",
        "        return tokenizer(query_and_docs, truncation=True, padding='longest',\n",
        "                          max_length=512, return_tensors=\"pt\")\n"
      ],
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AUmAPeSqKUBh"
      },
      "source": [
        "mapping dict that tells haystack what tokenizer variant to use for a model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3oC62rpBaHYc",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "4edab4d9-8780-41ee-9d14-c138d7ade90d"
      },
      "source": [
        "# create dict object of model name to converter\n",
        "model_converters = {\n",
        "    \"ramsrigouthamg/t5_paraphraser\":_T5asQA(), # works, decent\n",
        "    \"tuner007/pegasus_qa\":_PegParaPhraseConv(), # wroks, really good\n",
        "    \"google/t5-v1_1-large\":_T5asQA(), # works most of the time\n",
        "    \"allenai/unifiedqa-t5-large\":_T5asQA(),  # works & good\n",
        "    \"google/t5-large-ssm-nq\":_T5asQA(), \n",
        "    \"google/t5-large-ssm-nqo\":_T5asQA(), \n",
        "    \"google/t5-large-ssm\": _T5asQA(), # crashes, need to investigate\n",
        "    \"allenai/led-base-16384\": _LEDasQA(),# works & mediocre\n",
        "    \"google/pegasus-big_patent\": _PegParaPhraseConv(), # works & good\n",
        "    \"vasudevgupta/bigbird-pegasus-large-bigpatent\": _BigBirdforQA(),  # works + GOAT\n",
        "    \"google/bigbird-pegasus-large-bigpatent\": _BigBirdforQA(), \n",
        "    \"google/bigbird-pegasus-large-arxiv\": _BigBirdforQA(), \n",
        "    \"valhalla/distilt5-qa-qg-hl-12-6\":_T5asQA(), # works, meh\n",
        "    'akshara23/Pegasus_for_Here':_PegParaPhraseConv(), # works well\n",
        "    \"tuner007/pegasus_paraphrase\":_PegParaPhraseConv(),\n",
        "    \"google/pegasus-reddit_tifu\":_PegParaPhraseConv(),\n",
        "    \"google/pegasus-large\":_PegParaPhraseConv(),\n",
        "}\n",
        "\n",
        "# SSM nq might need it's own class  - input_ids = t5_tok(\"When was Franklin D. Roosevelt born?\", return_tensors=\"pt\").input_ids\n"
      ],
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LaCFopr_H-f0"
      },
      "source": [
        "### Select model\n",
        "\n",
        "- the model used from huggingface.co as a generator should be defined above\n",
        "- the default model `yjernite/bart_eli5` has decent outputs but the usefulness declines the more complicated the question is. (to use this, uncheck `use_custom_model`)\n",
        "- **In general, best results are from `akshara23/Pegasus_for_Here`, `vasudevgupta/bigbird-pegasus-large-bigpatent`, and `google/pegasus-reddit_tifu`**\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lK5wWFo6bg8c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "cellView": "form",
        "outputId": "98671341-1e51-4938-92ef-fbb8f549b722"
      },
      "source": [
        "# decrease_if_crash (change model type)\n",
        "cust_model_name = \"google/bigbird-pegasus-large-bigpatent\" #@param [\"ramsrigouthamg/t5_paraphraser\", \"google/t5-v1_1-large\", \"allenai/unifiedqa-t5-large\", \"allenai/led-base-16384\", \"google/bigbird-pegasus-large-arxiv\", \"akshara23/Pegasus_for_Here\", \"google/pegasus-reddit_tifu\", \"google/bigbird-pegasus-large-bigpatent\", \"tuner007/pegasus_qa\", \"google/pegasus-large\"]\n",
        "use_custom_model = True #@param {type:\"boolean\"}\n",
        "model_min_l =  200#@param {type:\"integer\"}\n",
        "model_max_l =  2048#@param {type:\"integer\"}\n",
        "download_answers_txtfile = True #@param {type:\"boolean\"}"
      ],
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B1BmQ6c0Q1vS"
      },
      "source": [
        "### build generator"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bDRf3x5CXbhV",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "cellView": "form",
        "outputId": "960dd544-3313-42d9-9586-f7f04d147ad6"
      },
      "source": [
        "number_beam_search =  32#@param {type:\"integer\"}\n",
        "# decrease_if_crash\n"
      ],
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fyIuWVwhA6OB",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 86
        },
        "outputId": "9c370a06-98d6-460c-edd6-920f326534bd"
      },
      "source": [
        "gc.collect()\n",
        "if \"t5\" in cust_model_name and \"large\" in cust_model_name:\n",
        "    generator = Seq2SeqGenerator(model_name_or_path=cust_model_name,\n",
        "                                 input_converter=model_converters.get(cust_model_name),\n",
        "                                 min_length=model_min_l, max_length=model_max_l, \n",
        "                                 num_beams=4) # to not overload\n",
        "elif use_custom_model:\n",
        "    generator = Seq2SeqGenerator(model_name_or_path=cust_model_name,\n",
        "                                 input_converter=model_converters.get(cust_model_name),\n",
        "                                 min_length=model_min_l, max_length=model_max_l, \n",
        "                                 num_beams=number_beam_search)\n",
        "else:\n",
        "    generator = Seq2SeqGenerator(model_name_or_path=\"yjernite/bart_eli5\", \n",
        "                                 min_length=model_min_l, max_length=model_max_l, \n",
        "                                 num_beams=number_beam_search)\n",
        "    cust_model_name = \"yjernite/bart_eli5\""
      ],
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 22:56:25 - INFO - farm.utils -   Using device: CUDA \n",
            "08/19/2021 22:56:25 - INFO - farm.utils -   Number of GPUs: 1\n",
            "08/19/2021 22:56:25 - INFO - farm.utils -   Distributed Training: False\n",
            "08/19/2021 22:56:25 - INFO - farm.utils -   Automatic Mixed Precision: None\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MYjjRnNulvfj"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "unhLD18yA6OF"
      },
      "source": [
        "# <center> Question-Answer Pipeline </center>\n",
        "\n",
        "- **Use case:** \"I want to know a specific detail about a concept or how multiple concepts work together\"\n",
        "- _This is the part where you can ask questions to your course documents directly._\n",
        "- Use a Haystack `Pipeline` to build a search pipeline.\n",
        "> Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.\n",
        "To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.\n",
        "You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "5A8O9w7PBgvg",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "42174491-8c17-4f9d-a274-eb1b5cf6d4ff"
      },
      "source": [
        "questions_version = \"v1\" #@param {type:\"string\"}\n"
      ],
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TssPQyzWA6OG",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "d21440a4-19d4-474f-ddb9-9fb2233ccc09"
      },
      "source": [
        "from haystack.pipeline import GenerativeQAPipeline\n",
        "pipe = GenerativeQAPipeline(generator, retriever)\n",
        "\n",
        "print(\"generated QA pipeline off of textgen {} - \".format(cust_model_name), datetime.now())\n",
        "# note: cust_model_name gets overwritten above, in case of \"default\" bart_eli5"
      ],
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "generated QA pipeline off of textgen google/bigbird-pegasus-large-bigpatent -  2021-08-19 22:56:39.346721\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bXlBBxKXA6OL"
      },
      "source": [
        "## Single-Question Queries\n",
        "\n",
        "- <font color=\"salmon\"> **this is where questions should start to be entered** </font>\n",
        "- <font color=\"salmon\"> NOTE: the default model, `yjernite/bart_eli5`, really does not like the word \"**how**\" and will just sound incredulous that you asked it something</font>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "atBL7GaqOc55",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "1841ef39-b704-4b8b-b90a-746b88f7062c"
      },
      "source": [
        "# create records dataframe \n",
        "import pandas as pd\n",
        "info_queries = pd.DataFrame(columns=[\"query\", \"response\", \"query_type\", \"doc_group\", \"model_name\", \"context\"])"
      ],
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F2K4-aeVYOdd",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 377
        },
        "cellView": "form",
        "outputId": "57032d4c-890e-472f-9e39-0846fc3b9315"
      },
      "source": [
        "this_query = \"What is the k-means algorithm?\" #@param {type:\"string\"}\n",
        "topk_search =  100#@param {type:\"integer\"} # decrease_if_crash\n",
        "\n",
        "# run the QA pipeline once and keep the raw result dict\n",
        "this_response = pipe.run(query=this_query, \n",
        "                         top_k_retriever=topk_search,\n",
        "                         top_k_generator=1)\n",
        "# clean only the generated answers for display / logging\n",
        "resp_text = clean_output(this_response['answers'])\n",
        "new_row = {'query':this_query, \n",
        "           'response':resp_text, \n",
        "           'query_type':\"QA\", \n",
        "           'doc_group':course_name,\n",
        "           'model_name':cust_model_name,\n",
        "           'context':'NA',}\n",
        "# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions\n",
        "info_queries = pd.concat([info_queries, pd.DataFrame([new_row])], ignore_index=True)\n",
        "print(this_query, \"\\n\\n\")\n",
        "pp.pprint(resp_text, indent=4)\n"
      ],
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "What is the k-means algorithm? \n",
            "\n",
            "\n",
            "(\"['The k-means algorithm is one of the most popular iterative descent clus- \"\n",
            " 'tering methods intended for situations in which all variables are of the '\n",
            " 'quantitative type, and that for each such assignment there is a unique '\n",
            " 'optimum for the K pk, the algorithm must converge after a finite number of '\n",
            " 'iterations. It is not clear that the hard assignment to the nearest cluster '\n",
            " 'is the most appropriate. Since the assignment of observations to clusters at '\n",
            " 'any iteration is a perturbation of that for the previous iteration, only a '\n",
            " 'very small fraction of all possible assignments. However, these algorithms '\n",
            " 'converge to local optima which may be highly suboptimal when compared to the '\n",
            " 'global optimum. In the present invention, the convergence properties of '\n",
            " 'these algorithms are studied and it is shown that if the algorithm converges '\n",
            " 'to a local maximum of L, this will correspond to a global maximum of the log '\n",
            " 'likelihood function; and so in this incremental algorithm is increasing the '\n",
            " 'value of L and if the result no longer changes, a local optimum has been '\n",
            " \"reached.']\")\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Fw-QiM7_WKcT",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 291
        },
        "cellView": "form",
        "outputId": "5528771e-e2e0-4130-e857-107c0dee1150"
      },
      "source": [
        "this_query = \"What is a neural network?\" #@param {type:\"string\"}\n",
        "topk_search =  100#@param {type:\"integer\"} # decrease_if_crash\n",
        "\n",
        "# run the QA pipeline and keep the raw result dict\n",
        "this_response = pipe.run(query=this_query, \n",
        "                         top_k_retriever=topk_search,\n",
        "                         top_k_generator=1)\n",
        "# clean only the generated answers for display / logging\n",
        "resp_text = clean_output(this_response['answers'])\n",
        "new_row = {'query':this_query, \n",
        "           'response':resp_text, \n",
        "           'query_type':\"QA\", \n",
        "           'doc_group':course_name,\n",
        "           'model_name':cust_model_name,\n",
        "           'context':'NA',}\n",
        "# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions\n",
        "info_queries = pd.concat([info_queries, pd.DataFrame([new_row])], ignore_index=True)\n",
        "print(this_query, \"\\n\\n\")\n",
        "pp.pprint(resp_text, indent=4)\n"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "What is a neural network? \n",
            "\n",
            "\n",
            "(\"['It is not surprising that a neural network might use 20 or 100 such \"\n",
            " 'functions, while the model typically uses fewer terms ; and the '\n",
            " 'nameplateplate derives from the fact that they were first developed as '\n",
            " 'models for the human brain: Each unit represents a neuron, and the '\n",
            " 'connections represent synapses. A special family of convolutional neural '\n",
            " 'networks has evolved for convolutional classifying images such as these, and '\n",
            " 'has shown spectacular success on a neural wide range of problems; and for '\n",
            " 'some learning tasks the response is also a sequence, and s0 the output '\n",
            " 'sequence 01, 02, is explicitly needed; and recurrent neural networks can be '\n",
            " 'quite com- plex plex illustrate their use in two simple applications. This '\n",
            " 'is a division of application Ser. No. 08,00,, and it is submitted with the '\n",
            " 'understanding that it will not be used to interpret or limit the scope or '\n",
            " \"meaning of the claims under 37 C.F.R.. 1.72.']\")\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Y_t5fJLMiii2"
      },
      "source": [
        "### custom QA function\n",
        "\n",
        "- same as above, but create a function that iterates through a list of questions and nicely formats them + the output"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-gGWMwW7ikoU",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "0a11533f-7c4b-47bb-d09a-4350e29db50a"
      },
      "source": [
        "from google.colab import files\n",
        "import pprint as pp\n",
        "def answer_questions(das_pipeline, q_list, k_search=50, num_results_d_disp=1,\n",
        "                     export_txt=True, add_text=\"\", doc_ext=\".txt\"):\n",
        "    \"\"\"Run every question through das_pipeline, print the reply, and log it.\n",
        "\n",
        "    Each reply is added as a row to the global info_queries dataframe. When\n",
        "    export_txt is True, the question/answer log is also written to a file and\n",
        "    handed to download_file.\n",
        "\n",
        "    Args:\n",
        "        das_pipeline: object whose run(query=..., top_k_retriever=...,\n",
        "            top_k_generator=...) returns a dict with an \"answers\" list.\n",
        "        q_list: list of question strings.\n",
        "        k_search: passed to run() as top_k_retriever (decrease if the runtime crashes).\n",
        "        num_results_d_disp: passed to run() as top_k_generator; only the first\n",
        "            answer is printed and logged.\n",
        "        export_txt: write and download the QA log file when True.\n",
        "        add_text: extra text inserted into the output file name.\n",
        "        doc_ext: extension of the output file, e.g. \".md\" to save as markdown.\n",
        "    \"\"\"\n",
        "    gc.collect()\n",
        "    qa_document = []\n",
        "    global info_queries\n",
        "    spacer = \"\\n\\n----------------\\n\"\n",
        "    # enumerate from 1 so the counter reads \"1 of N\" ... \"N of N\" and stays\n",
        "    # correct when the same question is listed twice (list.index would not)\n",
        "    for q_num, question in tqdm(enumerate(q_list, start=1), total=len(q_list),\n",
        "                                desc=\"answering questions...\"):\n",
        "        print(spacer)\n",
        "        question_text = \"Question {} of {}: {}\".format(q_num,\n",
        "                                                       len(q_list),\n",
        "                                                       question)\n",
        "        pp.pprint(question_text)\n",
        "        print(\"\\n\")\n",
        "        # query the pipeline that was passed in, not the notebook-global `pipe`\n",
        "        this_result = das_pipeline.run(query=question, top_k_retriever=k_search,\n",
        "                                       top_k_generator=num_results_d_disp)\n",
        "        \n",
        "        this_reply = clean_output(this_result[\"answers\"][0])\n",
        "        pp.pprint(this_reply, indent=5)\n",
        "        # log for text file\n",
        "        qa_document.extend([spacer, \"###\" + question_text + \"\\n\", \"\\nAnswer:\\n\",\n",
        "                            this_reply + \"\\n\\n\"])\n",
        "        # log for CSV\n",
        "        n_row_f = {'query':question, \n",
        "                    'response':this_reply, \n",
        "                    'query_type':\"QA\", \n",
        "                    'doc_group':course_name,\n",
        "                    'model_name':cust_model_name,\n",
        "                    'context':'NA',}\n",
        "        # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions\n",
        "        info_queries = pd.concat([info_queries, pd.DataFrame([n_row_f])],\n",
        "                                 ignore_index=True)\n",
        "\n",
        "    date_time = datetime.now().strftime(\"%m.%d.%Y, %H-%M\")\n",
        "    if export_txt:\n",
        "        # the extension is appended after cleaning so it is never stripped from the name\n",
        "        this_outname = remove_string_extras(\"qa_{}_exported_{}\".format(course_name, \n",
        "                                                                       add_text) + cust_model_name + date_time) + doc_ext\n",
        "        with open(this_outname, 'w', encoding='utf-8', errors='ignore') as qa_f:\n",
        "            qa_f.writelines(qa_document)\n",
        "        download_file(this_outname)\n",
        "\n",
        "    print(\"Completed QA - \", date_time)"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F3m7lffOkQZu"
      },
      "source": [
        "## List of Questions\n",
        "\n",
        "- *questions can be added / removed, leave it as a list*"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BC1IykFQkL-H",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "c4390022-359f-4935-9375-ef408b5a644b"
      },
      "source": [
        "QUESTIONS = [\n",
        "        'What is the process of hyperparameter optimization?',\n",
        "        'What are the properties of a robust regression estimator?',\n",
        "        'What are the fundamental assumptions of a robust regression estimator?',\n",
        "        'What is regularization, and why is it important?',\n",
        "        'What is regularization used for?',\n",
        "        'In general, how do the bias and variance properties of the ridge regression estimator compare to those of the ordinary least squares (OLS) estimator?',\n",
        "        'What is the process to find the optimal weight vector in a ridge regression estimator?',\n",
        "        'How does model complexity change as a function of the regularisation term in a ridge regression estimator?',\n",
        "        'What is the purpose of boosting in regression?',\n",
        "        'What is the purpose of boosting in classification?',\n",
        "        'Is it true that greedy forward selection always selects a model with fewer features than Lasso regression?',\n",
        "        'How can a Support Vector Machine account for cases where the data is not linearly separable in the input space?',\n",
        "        'Does Lloyds algorithm for k-means clustering always find a globally optimal solution?',\n",
        "        'Is it possible to reconstruct a dataset from the first two principal components? If not, what else is required?',\n",
        "    ]"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "whbQFC5xDQA8"
      },
      "source": [
        "## Run list for Question-Answer\n",
        "\n",
        "- depending on the database size, the number of questions, and so on, you may need to run the QA list and the search term list separately"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "-jawHqKh8Wms",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "0762ff4f-cd31-4d74-d520-f054bbb60b73"
      },
      "source": [
        "process_QA_list = True #@param {type:\"boolean\"}\n"
      ],
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RkyOXXavkMpx",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000,
          "referenced_widgets": [
            "cadfba2ad49c4491bebfeea46cdbfd04",
            "0a7d88f48b844e57883a13932950e40a",
            "d1010b43f7c7492fb43b27d42dc969b8",
            "ecb0228894774a558a532d121c1ab5f1",
            "06773afd7dec4dbf97bb3ed9a6625d90",
            "37c0072536cb4b72b01c63f97af2ed6a",
            "ccebaabfa49542728426f5cc832b8f32",
            "5899fb0c71bb412b9c6e720f6025ed8d",
            "efc60ece34b64c7883fada81364b732b",
            "eabf44e30c384f0ba67d15daf9e6a058",
            "070cf267930d4ad5b37ebedaedefa1fd"
          ]
        },
        "outputId": "3ee4f747-128f-479e-cdc3-9a7f3bb13985"
      },
      "source": [
        "if process_QA_list:\n",
        "    answer_questions(pipe, QUESTIONS, add_text=\"main_{}_\".format(questions_version),\n",
        "                     k_search=100, # decrease_if_crash\n",
        "                     export_txt=download_answers_txtfile)"
      ],
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "cadfba2ad49c4491bebfeea46cdbfd04",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "answering questions...:   0%|          | 0/16 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 0 of 16: What is the process of hyperparameter optimization?'\n",
            "\n",
            "\n",
            "('In machine learning, hyperparameter optimization methods are used for '\n",
            " 'training by min- imizing an objective function 0n the training data, but the '\n",
            " 'overall goal is to improve generalization performance. The goal in machine '\n",
            " 'learning does not necessarily need a precise estimate of the min- imum of '\n",
            " 'the objective function, approximate gradients using mini-batch approaches '\n",
            " 'have been widely used, Spatial gradient descent is very effective in '\n",
            " 'large-scale machine learning problems, poorly conditioned convex problems, '\n",
            " 'gradient descent increasinglyzigzags as the gradients point nearly '\n",
            " 'orthogonally to the shortest di- rection to a minimum point; see General '\n",
            " 'question: What is the process of hyperparameter optimized hyperparameter '\n",
            " 'problem solving is partial least squares solving, it uses the response y to '\n",
            " 'construct its directions, its solution path is a nonlinear function of y. In '\n",
            " 'this game, contestants have to solve various word puzzles and answer a '\n",
            " 'variety of trivia questions, but if they answer incorrectly, they lose '\n",
            " 'money: This Abstract is submitted with the understanding that it will not be '\n",
            " 'used to interpret or limit the scope or meaning of the claims.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 1 of 16: How to compare estimated vs standard errors?'\n",
            "\n",
            "\n",
            "('There are many ways to estimate the error rate. We consider two: cross- '\n",
            " 'validation and probability inequalities. We decided to perform this test '\n",
            " 'only after examining the data and noting that Focal and Discrete had the '\n",
            " 'highest and lowest mean performances. In a sense, this means that we have '\n",
            " 'implicitly performed = 5 hypothesis tests; rather than just one, as '\n",
            " 'discussed in FIG. 3. This test is a nonparametric method for testing whether '\n",
            " 'two distributions are the same_exact test is not an exact test; meaning that '\n",
            " 'it is not based on large sample theory approximations. The tests are '\n",
            " 'independent each with type-I error rate @, then the family-wise error rate. '\n",
            " 'It is the type of hypothesis we would consider when testing whether a '\n",
            " 'treatment differs from a placebo. The additive logistic regression model can '\n",
            " 'be generalized further to han- dle more than two classes, using the '\n",
            " 'multilogit formulation as outlined in Patent FIG. 4, the algorithms for '\n",
            " 'fitting such models are more complex than prior art.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 2 of 16: How to estimate standard deviation in regression analysis?'\n",
            "\n",
            "\n",
            "('The standard deviation is the posterior standard deviation under a uniform '\n",
            " 'prior and hence no posterior expected loss there is no automatic way of '\n",
            " 'deriving an optimal estimator; unlike the Bayesian case. The additive '\n",
            " 'logistic regression model can be generalized further to han- dle more than '\n",
            " 'two classes, using the multilogit formulation as outlined in FIG. 2. The '\n",
            " 'algorithms for fitting such models are more complex. In this case the '\n",
            " 'estimator is the cross-validation estimate of true error rate. The error '\n",
            " 'rate is the difference between the observed error rate and dashed line. The '\n",
            " 'solid line is the average of the observed and residuals y _ f in regression. '\n",
            " 'The dashed line is an estimate of the error rate that is cross-validated and '\n",
            " 'probability inequalities are used to yield the same solution when applied to '\n",
            " 'the population joint distribution, the same is not true for finite data '\n",
            " 'sets. The model is monotone decreasing functions of theuman margin yf. In '\n",
            " 'classification, the margin plays a role analogous to the residuals.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 3 of 16: What are the properties of a robust regression estimator?'\n",
            "\n",
            "\n",
            "('The property of a robust regression estimator is to be consistent if it '\n",
            " 'eventually recovers the true parameters that generated the data as the '\n",
            " 'sample size goes to infinity, i.e. 0 0* as Idl. This concept only makes '\n",
            " 'sense if the data actually comes from the specified model with parameters A* '\n",
            " 'which is not usually the case with real data. There is no automatic way of '\n",
            " 'deriving an optimal estimator; unlike the Bayesian case. It combines prior '\n",
            " 'beliefs with data in a principled way; use in Bayesian ference; construct '\n",
            " 'procedures with guaranteed long run performance, such as confidence '\n",
            " 'intervals; use frequentist methods; use sparse methods; Bayesian methods run '\n",
            " 'into problems when the parameter space is high dimensional. In particu- lar, '\n",
            " '95 percent posterior intervals need not contain the true value 95 percent of '\n",
            " 'the time - 11.10 KHz <n> This Abstract is submitted with the understanding '\n",
            " 'that it only will be used to assist in determining, from a cursory '\n",
            " 'inspection, the nature and gist of the technical disclosure.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 4 of 16: What are the fundamental assumptions of a robust '\n",
            " 'regression estimator?')\n",
            "\n",
            "\n",
            "('The present invention is a method for constructing a robust estimator that '\n",
            " 'is consistent if it eventually recovers the true parameters that generated '\n",
            " 'the data as the sample size goes to infinity, i.e. 0D 0* as Idl. This '\n",
            " 'concept only makes sense if the data actually comes from the specified model '\n",
            " 'with parameters A* which is not usually the case with real data. In this '\n",
            " 'concept there is no prior and hence no posterior posterior expected loss '\n",
            " 'there is not automatic way of deriving an optimal estimator; unlike the '\n",
            " 'Bayesian case. It combines prior beliefs with data in a way; use Bayesian in '\n",
            " 'ference. It constructs procedures with guaranteed long run performance, such '\n",
            " 'as confidence intervals; use frequentist methods. In particu- lar space, 95 '\n",
            " 'percent posterior intervals need not contain the true value 95 percent of '\n",
            " 'the time - 11.10 lar - 12.11 lar under a uniform prior; whereas in the '\n",
            " 'frequency sense, there is a loss function and a likelihood, but the same is '\n",
            " 'not true.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 5 of 16: What is regularization, and why is it important?'\n",
            "\n",
            "\n",
            "('The regularization term is sometimes called the penalty term, which bi- '\n",
            " 'penalty term ases the vector 0 to be closer to the origin. It has deep '\n",
            " 'relation to regularization ships to the bias-variance trade-off and feature '\n",
            " 'selection. It also appears in probabilistic models as the prior probability '\n",
            " 'of the parameters from that for the posterior distribution to be of the same '\n",
            " 'form as theprior distribution is the prior and the likelihood need be con- '\n",
            " 'jugate jugate will revisit this idea in a probabilistic model: example, we '\n",
            " 'might have conducted a survey, and some people might not have answered '\n",
            " 'certain questions. One or more sensors might have various sensors, some of '\n",
            " 'which fail. This might be known as tree reparameterization, which should not '\n",
            " 'be confused with the more sophisticated tree-reweighted BP which is commonly '\n",
            " 'used definitions of the algorithms that give rise to the same algorithm. The '\n",
            " 'concept of regularization is also discussed in verse problems, probabilistic '\n",
            " 'models, and appli- cation component analysis.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 6 of 16: What is regularization used for?'\n",
            "\n",
            "\n",
            "('The concept of regularization has its roots in the solution of ill-posed in- '\n",
            " 'verse problems. It has deep relation- regularization ships to the '\n",
            " 'bias-variance trade-off and feature selection. The approach presented here '\n",
            " 'is an approach to regularization called penalty term regularization, which '\n",
            " 'bi- penalty term ases the vector 0 to be closer to the origin, which also '\n",
            " 'appears in probabilistic models as the prior probability of the parameters '\n",
            " 'from that for the posterior distribution to be of the same form as a prior '\n",
            " 'distribution, and the prior and the likelihood need to be con- jugate jugate '\n",
            " 'to be the same for a posterior distribution with a posterior posterior '\n",
            " 'posterior distribution of a different form than the prior posterior '\n",
            " 'distribution. The penalty term is also presented in the form of the '\n",
            " 'orthogonal projection of the data onto a lower dimensional linear space, '\n",
            " 'known as the principal subspace such that the variance of the projected data '\n",
            " 'is maximized, and can be used for appli- cations such as feature extraction, '\n",
            " 'data compression, and data visualization.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 7 of 16: In general, how do the bias and variance properties of the '\n",
            " 'ridge regression estimator compare to those of the ordinary least squares '\n",
            " '(OLS) estimator?')\n",
            "\n",
            "\n",
            "('This invention describes how the bias and variance properties of the ridge '\n",
            " 'regression estimator compare to those of the ordinary least squares '\n",
            " 'estimator. This is useful when one has a large number of variables with '\n",
            " 'similarly sized coefficients; ridge shrinks their coefficients toward zero, '\n",
            " 'and those of strongly correlated variables toward each other: This '\n",
            " 'regularization via ridge stabilizes the model and al lows all the variables '\n",
            " 'to have their say ; however; that they are derived as posterior modes, that '\n",
            " 'is, maximizers of the posterior. This makes the problem nonsingulal, even if '\n",
            " 'X is not of full rank; and was the main motivation for ridge regression when '\n",
            " 'it was first introduced in statistics. It is not worth the effort for the '\n",
            " 'extra variance incurred. The problem is solved by using the mean and mode as '\n",
            " 'the posterior mean and lassoes estimate for the ridge coefficient estimates '\n",
            " 'for the prostate can- cer example, plotted as functions of <n><n> '\n",
            " 'A.sub.q.sup.2<n><n><n>.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 8 of 16: What is the process to find the optimal weight vector in a '\n",
            " 'ridge regression estimator?')\n",
            "\n",
            "\n",
            "('This is the process to find the optimal weight vector in a ridge regression '\n",
            " 'estimator. This makes the problem nonsingulal, even if X is not of full '\n",
            " 'rank; and ridge regression was the main motivation for ridge regression when '\n",
            " 'it was first introduced in statistics. A simple example shows the ridge '\n",
            " 'coefficient estimates for the prostate can- cer example, plotted as '\n",
            " 'functions of df, the effective degrees of freedom implied by the penalty '\n",
            " 'This can strengthen this argument through a simple example, taken from '\n",
            " 'assumption that the true population coefficients of these trees arose from a '\n",
            " 'Gaussian distribution; then we know that in a optimal sense the best '\n",
            " 'predictor is ridge regression, we should use an Lz rather than an L1 penalty '\n",
            " 'when fitting the coefficients. A neural network is useful when sparsity '\n",
            " 'ad-hoc methods for doing this were devised with names such as brain damage '\n",
            " 'injuries were devised in 1990s; see e.g., FIG. FIG. 1 and FIG. 3.18 shows '\n",
            " 'coefficient profiles for the different methods.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 9 of 16: How does model complexity change as a function of the '\n",
            " 'regularisation term in a ridge regression estimator?')\n",
            "\n",
            "\n",
            "('ridge regression is useful when one has a large number of variables with '\n",
            " 'similarly sized coefficients; ridge shrinks their coefficients toward zero, '\n",
            " 'and those of strongly correlated variables toward each other: Although the '\n",
            " 'size of the training sample might not permit all the variables to be in the '\n",
            " 'model, this regularization via ridge stabilizes the model and al lows all '\n",
            " 'the variable to have their say. The second term, A Cj 8 called a shrinkage '\n",
            " 'penalty; is shrinkage small when 81, 8p are close to 0, and s0 it has the '\n",
            " 'effect of shrinking penalty the estimates of B; towards zero. The tuning '\n",
            " 'parameter A serves to control the relative impact of these two terms on the '\n",
            " 'regression coefficient esti- mates 1 = 0, the penalty term has no effect, '\n",
            " 'and ridge regression will produce the least squares estimates The funnel '\n",
            " 'shape indicates heteroscedasticity: The assumption of uncorrelated errors '\n",
            " 'could be violated if some of the indiuals in the study are members of the '\n",
            " 'same family; eat the same diet, or have been exposed to the same '\n",
            " 'environmental factors.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 10 of 16: What is the purpose of boosting in regression?'\n",
            "\n",
            "\n",
            "('The principal difference between boosting and the committee methods such as '\n",
            " 'bagging discussed above, is that the base classifiers are trained in '\n",
            " 'sequence, and each base classifier is trained using a weighted form of the '\n",
            " 'data set in which the weighting coefficient associated with each data point '\n",
            " 'depends on the performance of the previous classifiers. In particular, '\n",
            " 'points that are misclassified by one of the base classifications are given '\n",
            " 'greater weight when used to train the next classifier in the sequence. 12 '\n",
            " 'FIG. 12. This is a method for starting with a simple classifier and '\n",
            " 'gradually improving it by refitting the data giving higher weight to '\n",
            " 'misclassified samples, which can be extended to handle a variety of loss '\n",
            " 'functions, including for regression, robust regression, Poisson regression, '\n",
            " 'etc In this section, we shall present this statistical inter-boost pretation '\n",
            " 'of boosting, drawing on the reviews in FIG. DEihlmann and FIG. chlmhorn and, '\n",
            " 'which should be consulted for further details, and which is referred to '\n",
            " 'further details.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "'Question 11 of 16: What is the purpose of boosting in classification?'\n",
            "\n",
            "\n",
            "('The principal difference between boosting and the committee methods such as '\n",
            " 'bagging discussed above, is that the base classifiers are trained in '\n",
            " 'sequence, and each base classifier is trained using a weighted form of the '\n",
            " 'data set in which the weighting coefficient associated with each data point '\n",
            " 'depends on the performance of the previous classifiers. In particular, '\n",
            " 'points that are misclassified by one of the base classifications are given '\n",
            " 'greater weight when used to train the next classifier in the sequence, e.g., '\n",
            " 'that e -1,1 and that each h is such that h 1,1. This can incorporate unequal '\n",
            " 'weights quite easily in most algorithms. For example, in constructing a '\n",
            " 'tree; we could replace the impurity measure with a weighted impurity '\n",
            " 'measure: original version of boosting; called FIG. No. 376<n> This Abstract '\n",
            " 'is submitted with the understanding that it only will be used to assist in '\n",
            " 'determining, from a cursory inspection, the nature and gist of the technical '\n",
            " 'disclosure as described in 37 CFR 1.72.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 12 of 16: Is it true that greedy forward selection always selects a '\n",
            " 'model with fewer features than Lasso regression?')\n",
            "\n",
            "\n",
            "('It is well known that subset selection can be excessively greedy, often '\n",
            " 'yielding poor results when compared to less aggressive strategies such as '\n",
            " 'the lasso or ridge regression. This invention shows that it is true that '\n",
            " 'greedy forward selection always selects a model with fewer features than '\n",
            " 'Ellipticalso regression, and its more aggressive cousin best-subset se- '\n",
            " 'lection, which penalizes the number of non zero coefficients J = Zk lak/o. '\n",
            " 'It is also shown that its simplicity this has been shown to be superior to '\n",
            " 'other heuristics known such as least squares boosting, adaptive rejection '\n",
            " 'sampling, and greedy search from the top of the lattice downwards, for a '\n",
            " 'moderate fraction of dominant variables in the model, the SO- called '\n",
            " 'saturated model: The simplicity of this method is shown as follows:<n><n> 0t '\n",
            " 'p, and then choosing k argmax E<n><n><n> where is the probability of being '\n",
            " 'the optimal action: pk I = max E <n><n> p de.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 13 of 16: How can a Support Vector Machine account for cases where '\n",
            " 'the data is not linearly separable in the input space?')\n",
            "\n",
            "\n",
            "('For cases where the data is not linearly separable in the input space, it is '\n",
            " 'often helpful to encourage sparsity ad-hoc methods for doing this, with '\n",
            " 'names such as brain damage Computers were devised in the 1990s; see e.g. '\n",
            " 'patent application Ser. No. 369<n> It is shown that there are at most data '\n",
            " 'points falling outside the insensitive tube, while at least v data points '\n",
            " 'are support vectors and so lie either on the tube or outside it. The use of '\n",
            " 'a support vector machine to solve a regression problem is illustrated using '\n",
            " 'the sinusoidal data set in FIG. 3 369 where x 3. In this section we assume '\n",
            " 'that Y is binary will be convenient to label the outcomes as S= 1 and +l '\n",
            " 'instead of 0 and<n> A 1 = confidence interval for the true error rate i8 L + '\n",
            " 'e where 32 8 n log n 2 log 2 n 2<n> where n=l<n> This section we consider a '\n",
            " 'class of linear classifiers called support vector machines.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 14 of 16: Does Lloyds algorithm for k-means clustering always find '\n",
            " 'a globally optimal solution?')\n",
            "\n",
            "\n",
            "('An iterative descent clus- tering method for k-means clustering in which the '\n",
            " 'loss function is guaranteed to decrease monotonically in each iteration '\n",
            " 'until convergence: We prove this separately for the assignment step and the '\n",
            " 'refitting step. When the result no longer changes, a local optimum has been '\n",
            " 'reached. The algorithm finds a local rather than a global opti- mum, the '\n",
            " 'results obtained will depend on the initial random) cluster as signment of '\n",
            " 'each observation in FIG. 1 of Conditional of FIG. 2. The assignment of '\n",
            " 'observations to clusters at any iteration is a perturbation of that for the '\n",
            " 'previous iteration, only a very small fraction of all possible assignments. '\n",
            " 'This algorithm converges to local optima which may be highly suboptimal when '\n",
            " 'compared to the global optimum in accordance with the following questions, '\n",
            " 'we show that this algorithm always find a globally optimal solution in terms '\n",
            " 'of a consensus of independent weak learners: The goal is to identify a small '\n",
            " 'subset that is likely to contain the optimal one; O at least a good '\n",
            " 'suboptimal one.')\n",
            "\n",
            "\n",
            "----------------\n",
            "\n",
            "('Question 15 of 16: Is it possible to reconstruct a dataset from the first '\n",
            " 'two principal components? If not, what else is required?')\n",
            "\n",
            "\n",
            "('It is not possible to reconstruct a dataset from the first two principal '\n",
            " 'components. If not, what else is required, for example for handling missing '\n",
            " 'inputs in decision trees, which can induce a similar partition to the chosen '\n",
            " 'variable at any given split. This method finds highly correlated features, '\n",
            " 'and can be thought of as learning a local joint model of the input: This has '\n",
            " 'the advantage over a generative model of not modeling the entire joint '\n",
            " 'distribution of inputs, but it has the disadvantage of being entirely ad '\n",
            " 'hoc. For example, in typical applications in biology; psychology; marketing; '\n",
            " 'and other domains, the linear model is at best an extremely rough '\n",
            " 'approximation to the data, and residual errors due to other unmeasured '\n",
            " 'factors are often very large. There are two versions: the sum-product form, '\n",
            " 'also known as the division algorithm, named after Contractfer and '\n",
            " 'Contractnoy 1990); and the belief updating form, which involves division, '\n",
            " 'also named after a company, Contracturitzen- Parkiegelhalter algorithm.')\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "download(\"download_b3afb56d-ebed-4603-8eed-31c77997ffdc\", \"qa_intro to ML_exported_main_v1_googlebigbirdpegasuslargebigpatent08.19.2021, 2301.txt\", 18293)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "Completed QA -  08.19.2021, 23-01\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e46AMZxiVIWr"
      },
      "source": [
        "### question list two\n",
        "\n",
        "sometimes it's useful to create multiple lists of questions, separated by topic"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bd9bfNamgBJV",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "66b234c0-cacc-408c-d4b4-48b4cb45b177"
      },
      "source": [
        "question_set_2 = [\n",
        "                    \n",
        "]"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "E4WgoL7WgDl8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "d6016be9-0a44-41ff-994a-16698127da10"
      },
      "source": [
        "if len(question_set_2) > 0 and process_QA_list:\n",
        "    answer_questions(pipe, question_set_2, add_text=\"question_set_2_\",\n",
        "                    k_search=150, # decrease_if_crash\n",
        "                    export_txt=download_answers_txtfile)"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OliWlJbQDzmq"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "99rbio1E3KE_"
      },
      "source": [
        "# <center> Relevant Document Search </center>\n",
        "\n",
        "- enter a topic, and the retriever will get you the best N results to check out for your topic\n",
        "- uses `DocumentSearchPipeline`, so searches the texts in the corpus but only completes *extractive* summarization\n",
        "- **Use case:** \"I want to know where a concept or term shows up in the course documents\"\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QUYHxaX6oHSz",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        },
        "outputId": "46fef7cb-28c6-476e-e868-6e18b0f4ebfd"
      },
      "source": [
        "from haystack.pipeline import DocumentSearchPipeline \n",
        "\n",
        "# del retriever\n",
        "# re-using from earlier \n",
        "retriever_search = retriever\n",
        "\n",
        "search_pipe = DocumentSearchPipeline(retriever_search)\n",
        "\n",
        "print(\"the URL containing text documents in search is: \\n\", URL_to_archive)"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "the URL containing text documents in search is: \n",
            " https://www.dropbox.com/sh/65nm4ks3m1wlvgn/AACz3XyHxEZc1HaPTYaXAo0va?dl=1\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E282IyIj_yCD"
      },
      "source": [
        "## single query\n",
        "\n",
        "**note** that the below uses the `print_documents()` function, a dictionary with a lot of other metadata is returned (*if you want to save/customize the response, just save the function output to a var*)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nVdzO2gi_zsR",
        "cellView": "form",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 548
        },
        "outputId": "33b07807-73ed-4f44-ec50-a94883a87300"
      },
      "source": [
        "rd_query = \"Gaussian Mixture Models\" #@param {type:\"string\"}\n",
        "num_results =  3#@param {type:\"integer\"}\n",
        "search_result = search_pipe.run(query=rd_query, \n",
        "                                top_k_retriever=num_results)\n",
        "print_documents(search_result, max_text_len=512)\n",
        "\n",
        "# print(search_result)"
      ],
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "Query: Gaussian Mixture Models\n",
            "\n",
            "{   'name': 'mlpp_1811_mixture_models_and_the_em_algorith.txt',\n",
            "    'text': '0.8 0.2 0.3 0.6 0.8 0.9 (a) (b) Figure 11.3 A mixture of 3 '\n",
            "            'Gaussians in 2d. (a) We show the contours of constant probability '\n",
            "            'for each component in the mixture. () A surface plot of the '\n",
            "            'overall density Based on Figure 2.23 of (Bishop 2006a). Figure '\n",
            "            'generated by mixGaussPlotDemo. 11.2.1 Mixtures of Gaussians The '\n",
            "            'most widely used mixture model is the mixture of Gaussians (MOG), '\n",
            "            'also called a Gaussian mixture model or GMM In this model, each '\n",
            "            'base distribution in the mixture is a multivariate Gaussian with '\n",
            "            'mean ...'}\n",
            "\n",
            "{   'name': 'PRML_139_mixture_models_and_e.txt',\n",
            "    'text': '9.3.3 Mixtures f Bernoulli distributions So far in this chapter; '\n",
            "            'we have focussed on distributions over continuous vari- ables '\n",
            "            'described by mixtures of Gaussians. As a further example of '\n",
            "            'mixture mod- elling, and to illustrate the EM algorithm in a '\n",
            "            'different context; we now discuss mix- tures of discrete binary '\n",
            "            'variables described by Bernoulli distributions. This model is '\n",
            "            'also known as latent class analysis (Lazarsfeld and Henry, 1968; '\n",
            "            'McLachlan and Peel, 2000).'}\n",
            "\n",
            "{   'name': 'ESL__86_kernel_smoothing_method.txt',\n",
            "    'text': 'The Gaussian mixture model has the form M f (a) = @m O(x; pm , Zm '\n",
            "            '(6.32) m=1 with mixing proportions @m , @m = 1, and each Gaussian '\n",
            "            'density has m a mean m and covariance matrix Zm In general, '\n",
            "            'mixture models can use any component densities in place of the '\n",
            "            'Gaussian in (6.32): the Gaussian mixture model is by far the most '\n",
            "            'popular: The parameters are usually fit by maximum likelihood, '\n",
            "            'using the EM algorithm as described in Chapter &.'}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "3jGW7zwRotZ6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 446
        },
        "outputId": "fcf54290-15d4-4d60-e585-b8a3adce6429"
      },
      "source": [
        "rd_query = \"generative adversarial network\" #@param {type:\"string\"}\n",
        "num_results =  3#@param {type:\"integer\"}\n",
        "search_result = search_pipe.run(query=rd_query, \n",
        "                                top_k_retriever=num_results)\n",
        "print_documents(search_result, max_text_len=512)\n",
        "\n",
        "# print(search_result)"
      ],
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "Query: generative adversarial network\n",
            "\n",
            "{   'name': 'ISL2_1311_survival_analysis_and_censored_dat.txt',\n",
            "    'text': 'survdata () function, which is part of the coxed library: Our '\n",
            "            'simulated data will rep- resent the observed wait times (in '\n",
            "            'seconds) for 2,000 customers who have phoned a call center. In '\n",
            "            'this context, censoring occurs if a customer hangs up before his '\n",
            "            'or her call is answered. There are three covariates: Operators '\n",
            "            '(the number of call center operators available at the time of the '\n",
            "            'call, which can range from 5 to 15), Center (either A, B; or C), '\n",
            "            'and Tine of day (Morning; Afternoon; or Evening).'}\n",
            "\n",
            "{   'name': 'ISL2_1311_survival_analysis_and_censored_dat.txt',\n",
            "    'text': 'effect data The \"observed\" data is stored in queuingsdata, with y '\n",
            "            'corresponding to the event time and failed an indicator of '\n",
            "            'whether the call was answered (failed = T) or the customer hung '\n",
            "            'up before the call was answered (failed = F)_ We see that almost '\n",
            "            '90% of calls were answered.'}\n",
            "\n",
            "{   'name': 'ISL2_1412_unsupervised_learnin.txt',\n",
            "    'text': 'For this data, complete and average linkage generally separate '\n",
            "            'the observations into their correct groups. However, single '\n",
            "            'linkage identifies one point as belonging to its own cluster. A '\n",
            "            'more sensible answer is obtained when four clusters are selected; '\n",
            "            'although there are still two singletons.'}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F3M_i2VSDs8y"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zuTQrNSWoH0F"
      },
      "source": [
        "# <center> Summarized Document Search </center>\n",
        "\n",
        "- enter a topic, and the pipeline will get you a summarized version of the best N results for that topic\n",
        "- **Use case:** \"I want to know the general definition of a concept or term\"\n",
        "\n",
        "\n",
        "## details\n",
        "- uses `SearchSummarizationPipeline`\n",
        "- summarization [options](https://huggingface.co/models?filter=summarization) (*to add a new one, just edit the dropdown*)\n",
        "- in general, the `bigbird` model summaries are the best as they are for longer documents. Then comes the `longformer`, then \"standard\" `pegasus` models\n",
        "    - details on the [longformer](https://huggingface.co/allenai/led-large-16384) by allenai"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FviP58rPBDPp",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "cellView": "form",
        "outputId": "e4e8e9b1-436c-4684-d5ab-fdf36b17f367"
      },
      "source": [
        "sum_model = \"allenai/led-large-16384-arxiv\" #@param [\"google/bigbird-pegasus-large-bigpatent\", \"facebook/bart-large-cnn\", \"google/pegasus-reddit_tifu\", \"google/pegasus-xsum\", \"allenai/led-large-16384-arxiv\", \"allenai/led-large-16384\"]\n",
        "\n",
        "# decrease_if_crash (in this case, change)"
      ],
      "execution_count": 43,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kHpqZ9SDefFL"
      },
      "source": [
        "## model config \n",
        "\n",
        "- important to define some parameters for the summarizer, otherwise the summarized response may repeat itself a bunch of times, etc.\n",
        "- see [transformers docs](https://huggingface.co/transformers/main_classes/configuration.html#transformers.PretrainedConfig.from_pretrained) for more detail on how this works\n",
        "-  [hf LED api docs](https://huggingface.co/transformers/model_doc/led.html)\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "knHHDggu45tF",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "707abc7e-c7e5-48b2-b1d8-8318f9bd9fb4"
      },
      "source": [
        "from transformers import AutoModel\n",
        "\n",
        "def customize_pegasus(hf_model_name, outfolder=\"custom_pegasus\"):\n",
        "    # probably will work for all pegasus-based models. need to test others\n",
        "\n",
        "    b_size =  512 # decrease_if_crash\n",
        "    num_random_blocks=3 # bigbird only\n",
        "    block_size=64 # bigbird only\n",
        "    max_l =  512\n",
        "    n_beams =  12 # decrease_if_crash\n",
        "    len_penalty =  1\n",
        "    rep_penalty =  3.01\n",
        "    no_rpt_ngram = 2 \n",
        "    num_return_sequences=1 \n",
        "    # experiment on block_size and num_random_blocks\n",
        "\n",
        "\n",
        "    PEGASUS_param_names = [\"Input Max No. Tokens (Batch)\", \"num_random_blocks\", \n",
        "                        \"block_size\", \"max_length\", \"num_beams\", \"length_penalty\",\n",
        "                        \"no_repeat_ngram_size\", \"num_return_sequences\",\n",
        "                        \"repetition_penalty \"]\n",
        "    PEGASUS_params = [b_size, num_random_blocks, block_size, max_l, \n",
        "                    n_beams, len_penalty, no_rpt_ngram, num_return_sequences,\n",
        "                    rep_penalty]\n",
        "\n",
        "    custom_bb = AutoModel.from_pretrained(hf_model_name, \n",
        "                                max_length=max_l, \n",
        "                                num_beams=n_beams, \n",
        "                                length_penalty=len_penalty,\n",
        "                                num_return_sequences=1,\n",
        "                                no_repeat_ngram_size=no_rpt_ngram,\n",
        "                                repetition_penalty=rep_penalty,\n",
        "                                gradient_checkpointing=True, # slower but save CUDA\n",
        "                                max_position_embeddings=4096,\n",
        "                                )\n",
        "    save_path = join(os.getcwd(), outfolder)\n",
        "    custom_bb.save_pretrained(save_path)\n",
        "\n",
        "    print(\"successfully created customized summarizer model\")\n",
        "\n",
        "    return save_path\n",
        "\n",
        "\n",
        "def customize_LED(hf_model_name, outfolder=\"custom_LED\"):\n",
        "    # allenai/led-base-16384 / allenai/led-large-16384\n",
        "\n",
        "    n_beams =  12 # decrease_if_crash\n",
        "    len_penalty =  3.5\n",
        "    rep_penalty =  3.01\n",
        "    no_rpt_ngram = 2 \n",
        "    num_return_sequences=1 \n",
        "    activation=\"gelu_new\" # \"gelu\" is default, \"gelu_new\" also possible\n",
        "\n",
        "    custom_bb = AutoModel.from_pretrained(hf_model_name, \n",
        "                                activation_function=activation,\n",
        "                                num_beams=n_beams, \n",
        "                                length_penalty=len_penalty,\n",
        "                                num_return_sequences=1,\n",
        "                                no_repeat_ngram_size=no_rpt_ngram,\n",
        "                                repetition_penalty=rep_penalty,\n",
        "                                gradient_checkpointing=True, # slower but save CUDA\n",
        "                                )\n",
        "    save_path = join(os.getcwd(), outfolder)\n",
        "    custom_bb.save_pretrained(save_path)\n",
        "\n",
        "    print(\"successfully created customized summarizer model\")\n",
        "\n",
        "    return save_path\n"
      ],
      "execution_count": 44,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BiwemyPRe5ED"
      },
      "source": [
        "## load model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Kn66kSLoxXMq",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 372,
          "referenced_widgets": [
            "c99dab409dc749d19586ba4ad916926c",
            "9ba51990f0fb4ef2810109bfb7b3be45",
            "325282267da54dcbb427b075a9f26506",
            "fbd0f39d95c949a7869931832f0bab3f",
            "2580299a6032432ba4cdfc864dd4d4c3",
            "f43384a4565541738cfc1525420c6cfc",
            "2bd3944dce784d4ca6aeead28c29735d",
            "83334a864ba448049fc5db60a045cb32",
            "11fcaee382324c56965ab0ff0e01e029",
            "3c8ca6b17c724ec3b011992a62950506",
            "40e53a738fef436a8bc6a50aed072c88",
            "f20da18f698148ab882eb91acd94e674",
            "23781c0c818341d9862bbca739e0331a",
            "02b5d04ceee64b088100fc831341775d",
            "a345cf60f8e54ef7a60f98a8a5b014e8",
            "384a057a20eb49269a840d21cc14b283",
            "f0a759afe79245a295ae1fb606fd73cb",
            "827936436d434bc9a7f530fcc7822248",
            "e34fbfe2349e42af9b8b3562df245387",
            "3ceb4427b953473bb04daa6e937d27c8",
            "0d814d6976144172a94bb2eef832f47b",
            "630ef2da281440e9b26e60ff5d2cc501"
          ]
        },
        "outputId": "2278b0c5-5e28-4957-b7a1-3bdba0d80a36"
      },
      "source": [
        "# load summarizer\n",
        "from haystack.summarizer import TransformersSummarizer\n",
        "gc.collect()\n",
        "\n",
        "if \"pegasus\" in sum_model.lower():\n",
        "    # use the above function to set some parameters\n",
        "    print(\"using custom model parameters - PEGASUS\\n\\n\")\n",
        "    custom_model_path = customize_pegasus(sum_model)\n",
        "    model_source = custom_model_path\n",
        "elif \"led\" in sum_model.lower() and \"allenai\" in sum_model.lower():\n",
        "    print(\"using custom model parameters - Longformer LED\\n\\n\")\n",
        "    custom_model_path = customize_LED(sum_model)\n",
        "    model_source = custom_model_path\n",
        "else:\n",
        "    # loads straight from the hugginface hub\n",
        "    model_source = sum_model"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "using custom model parameters - Longformer LED\n",
            "\n",
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 23:01:44 - INFO - filelock -   Lock 140576337884560 acquired on /root/.cache/huggingface/transformers/2c634ad4bb0dbf33caec40da8aec7346f8c9ccaa102bc1e8b0ad9e114e5f448a.15537842a4ac1ff0dadd776344eea2b441b120b79b6770216ba98e6e42ccd4eb.lock\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "c99dab409dc749d19586ba4ad916926c",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 23:01:44 - INFO - filelock -   Lock 140576337884560 released on /root/.cache/huggingface/transformers/2c634ad4bb0dbf33caec40da8aec7346f8c9ccaa102bc1e8b0ad9e114e5f448a.15537842a4ac1ff0dadd776344eea2b441b120b79b6770216ba98e6e42ccd4eb.lock\n",
            "08/19/2021 23:01:45 - INFO - filelock -   Lock 140576332796432 acquired on /root/.cache/huggingface/transformers/9ecdeb0e81951b50627b61aaa78d1a364d13217953860cb052130b12f2e053f9.94fc8d6760c34e39b663659ea41052aa573fc26562c1e8418e22b4e55dbf801f.lock\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f20da18f698148ab882eb91acd94e674",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/1.84G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 23:02:36 - INFO - filelock -   Lock 140576332796432 released on /root/.cache/huggingface/transformers/9ecdeb0e81951b50627b61aaa78d1a364d13217953860cb052130b12f2e053f9.94fc8d6760c34e39b663659ea41052aa573fc26562c1e8418e22b4e55dbf801f.lock\n",
            "Some weights of the model checkpoint at allenai/led-large-16384-arxiv were not used when initializing LEDModel: ['final_logits_bias', 'lm_head.weight']\n",
            "- This IS expected if you are initializing LEDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing LEDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "successfully created customized summarizer model\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0hztjKONcRoG",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 291
        },
        "outputId": "f2124db9-9642-48ae-e4f4-9161002fe201"
      },
      "source": [
        "%%capture\n",
        "summarizer = TransformersSummarizer(model_name_or_path=model_source,\n",
        "                                    tokenizer=sum_model,\n",
        "                                    min_length=64)"
      ],
      "execution_count": 46,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "08/19/2021 23:02:55 - INFO - filelock -   Lock 140573642385744 acquired on /root/.cache/huggingface/transformers/5ce67ebc34aa297017db8a2b3fddc582bda8bd6bae8951fcc2d4a0fc5d65e17b.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock\n",
            "08/19/2021 23:02:55 - INFO - filelock -   Lock 140573642385744 released on /root/.cache/huggingface/transformers/5ce67ebc34aa297017db8a2b3fddc582bda8bd6bae8951fcc2d4a0fc5d65e17b.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock\n",
            "08/19/2021 23:02:56 - INFO - filelock -   Lock 140573676703120 acquired on /root/.cache/huggingface/transformers/a50423ec3de34daea53de6a30ca96b60debdec4abdbc6a78fb7e2b145159b7eb.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock\n",
            "08/19/2021 23:02:56 - INFO - filelock -   Lock 140573676703120 released on /root/.cache/huggingface/transformers/a50423ec3de34daea53de6a30ca96b60debdec4abdbc6a78fb7e2b145159b7eb.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock\n",
            "08/19/2021 23:02:57 - INFO - filelock -   Lock 140573676701072 acquired on /root/.cache/huggingface/transformers/644496eac19670e4eae039a730bc2e3b219ecc44fcb1eb6827fe78d6590ee9bc.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock\n",
            "08/19/2021 23:02:57 - INFO - filelock -   Lock 140573676701072 released on /root/.cache/huggingface/transformers/644496eac19670e4eae039a730bc2e3b219ecc44fcb1eb6827fe78d6590ee9bc.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock\n",
            "08/19/2021 23:02:58 - INFO - filelock -   Lock 140574166556880 acquired on /root/.cache/huggingface/transformers/76d4e37013d7942047deb671fbb88ab3a515cbf9b8e0e758d715a4596e7664b9.cfc08f03f72cde495bd6b3dd3252bca130b3437de370856d084d1453c58b6fea.lock\n",
            "08/19/2021 23:02:58 - INFO - filelock -   Lock 140574166556880 released on /root/.cache/huggingface/transformers/76d4e37013d7942047deb671fbb88ab3a515cbf9b8e0e758d715a4596e7664b9.cfc08f03f72cde495bd6b3dd3252bca130b3437de370856d084d1453c58b6fea.lock\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "P-rTpMmxp_vl",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "9b242c42-d78c-4fda-fc11-84ce101b8139"
      },
      "source": [
        "# initialize summarize + search pipeline \n",
        "from haystack.pipeline import SearchSummarizationPipeline\n",
        "\n",
        "sumsearch_pipe = SearchSummarizationPipeline(summarizer, retriever_search)"
      ],
      "execution_count": 47,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Xm49XvWABcVG"
      },
      "source": [
        "## single query\n",
        "\n",
        "- `num_results` is how many documents it pulls to summarize. \n",
        "    - If the summarizer is not configured correctly, it will start repeating itself as the number gets higher\n",
        "    - increasing this parameter loads the GPU more"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DlZWhub2Bdmb",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 788
        },
        "cellView": "form",
        "outputId": "b50e316a-5f5f-4e7d-e334-acd1b06c2cd6"
      },
      "source": [
        "ss_query = \"gradient descent\" #@param {type:\"string\"}\n",
        "num_results =  50#@param {type:\"integer\"} # decrease_if_crash\n",
        "w_one_sum = True #@param {type:\"boolean\"}\n",
        "use_answer_fmt = True #@param {type:\"boolean\"}\n",
        "\n",
        "search_result = sumsearch_pipe.run(query=ss_query, \n",
        "                                top_k_retriever=num_results,\n",
        "                                generate_single_summary=w_one_sum,\n",
        "                                return_in_answer_format=use_answer_fmt,\n",
        "                                )\n",
        "# print_documents(search_result, max_text_len=256)\n",
        "answers_list = search_result[\"answers\"]\n",
        "count=0\n",
        "for entry in answers_list:\n",
        "\n",
        "    this_answer = clean_output(entry['answer'])\n",
        "    this_context = clean_output(entry['context'])\n",
        "    count += 1\n",
        "    print(\"\\n\\nItem #{} - answer\\n\".format(count))\n",
        "    pp.pprint(this_answer)\n",
        "    print(\"\\n the context (first 2k chars) is: \\n\")\n",
        "    pp.pprint(this_context[:2000]+\"...\")\n",
        "\n",
        "    new_row_sum = {'query':ss_query, \n",
        "                    'response':this_answer, \n",
        "                    'query_type':\"summary_search\", \n",
        "                    'doc_group':course_name,\n",
        "                    'model_name':sum_model,\n",
        "                    'context':this_context,\n",
        "                    }\n",
        "    info_queries = info_queries.append(new_row_sum, \n",
        "                                       ignore_index=True)\n"
      ],
      "execution_count": 48,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "Item #1 - answer\n",
            "\n",
            "('gradient descent is a first-order optimization algorithm : to find a local '\n",
            " 'minimum of a function using gradient descent; one takes steps propor- tional '\n",
            " 'to the near approximation in the steepest decent direction. it has to do '\n",
            " 'with implicit bias up and it was especially interesting because of the surge '\n",
            " 'of interest on. for this reason, the use of gradient information forms the '\n",
            " 'basis of practical algorithms for training neural networks 5.7.1 '\n",
            " 'Backpropagation how do we find the directions to move 0 SO as to decrease '\n",
            " 'the objective r(0) in (10.25)? 6.2 Gradient descent on a simple function '\n",
            " 'starting from (0,0), for 20 steps, using a fixed learning rate (step size ) '\n",
            " '7: The global minimum is at (1,1). 8: einen Moment(um) bittel = wt+1wt + Awt '\n",
            " 'with awt = yvt-1 n! l(')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('The valley when it actually surpasses it as one way to to take the testis '\n",
            " 'into account is a turnout to the opposite of what momentum does so what it '\n",
            " 'but it does rain now as it was presented we said we first compute the '\n",
            " 'gradient and at the current loc Action w t and then our cities so then. One '\n",
            " 'can visualize (Figure 10.17) standing in a mountainous terrain, and the goal '\n",
            " 'is to get to the bottom through a series of steps. As long as each step goes '\n",
            " 'downhill, we must eventually get to the bottom. In this case we were lucky; '\n",
            " 'because with our starting guess 00 we end up at the global minimum In '\n",
            " 'general we can hope to end up at a (good) local minimum 10.7.1 '\n",
            " 'Backpropagation How do we find the directions to move 0 SO as to decrease '\n",
            " 'the objective R(0) in (10.25) ? And indeed it was recently twenty, seventeen '\n",
            " 'twenty eighteen it was analyzed that a gradient descent actually finds the '\n",
            " 'direction of the maximum auto margin, which is the direction that maximizes '\n",
            " 'the auto distance of the point to the boundary it has to do with implicit '\n",
            " 'bias up and it was especially interesting because of the surge E of interest '\n",
            " 'on. Model fitting 247 0.5 (a) (b) Figure 8.2 Gradient descent on a simple '\n",
            " 'function starting from (0,0), for 20 steps, using a fixed learning rate '\n",
            " '(step size) 7: The global minimum is at (1,1). (a) n = 0.1. (6) n = 0.6. '\n",
            " \"Figure generated by steepestDescentDemo. And then in expectation, it's going \"\n",
            " 'to be the true gradient, but otherwise you will not without taking the '\n",
            " 'expectation you will not a find that you are definitely descending and so in '\n",
            " 'practice what you do it you just divide your entire training data set into '\n",
            " 'some subsets as once so and then you just a first do the first adoration be '\n",
            " 'using this. That sort of rolls down a is cars, landscape and and and '\n",
            " 'basically that a ball will not only go in the direction of steepest descent, '\n",
            " 'but it will build up a momentum instead of keep going in the direction after '\n",
            " 'it has been going right and so one way to implement this that is basica...')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hzY_tJzzSR1w",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 788
        },
        "cellView": "form",
        "outputId": "d32e67f9-1c3f-419a-b71c-63e2b0eaa556"
      },
      "source": [
        "ss_query = \"support vector machine\" #@param {type:\"string\"}\n",
        "num_results =  50#@param {type:\"integer\"} # decrease_if_crash\n",
        "w_one_sum = True #@param {type:\"boolean\"}\n",
        "use_answer_fmt = True #@param {type:\"boolean\"}\n",
        "\n",
        "search_result = sumsearch_pipe.run(query=ss_query, \n",
        "                                top_k_retriever=num_results,\n",
        "                                generate_single_summary=w_one_sum,\n",
        "                                return_in_answer_format=use_answer_fmt,\n",
        "                                )\n",
        "# print_documents(search_result, max_text_len=256)\n",
        "answers_list = search_result[\"answers\"]\n",
        "count=0\n",
        "for entry in answers_list:\n",
        "\n",
        "    this_answer = clean_output(entry['answer'])\n",
        "    this_context = clean_output(entry['context'])\n",
        "    count += 1\n",
        "    print(\"\\n\\nItem #{} - answer\\n\".format(count))\n",
        "    pp.pprint(this_answer)\n",
        "    print(\"\\n the context (first 2k chars) is: \\n\")\n",
        "    pp.pprint(this_context[:2000]+\"...\")\n",
        "\n",
        "    new_row_sum = {'query':ss_query, \n",
        "                    'response':this_answer, \n",
        "                    'query_type':\"summary_search\", \n",
        "                    'doc_group':course_name,\n",
        "                    'model_name':sum_model,\n",
        "                    'context':this_context,\n",
        "                    }\n",
        "    info_queries = info_queries.append(new_row_sum, \n",
        "                                       ignore_index=True)\n"
      ],
      "execution_count": 49,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "Item #1 - answer\n",
            "\n",
            "('principal component score vectors are the predictors, and one of the '\n",
            " 'features of their data matrix is the response. in each regression, the first '\n",
            " 'five principal components contain all the informa- tion about the initial '\n",
            " 'response to a test observation. as noted in the chapter, much more efficient '\n",
            " 'implementations of re- sampling approach to FDR calculation are available ; '\n",
            " 'using eg: the samr package in r: 13.7 Exercises Conceptual 1.2.5 xlab Nunber '\n",
            " 'of Rejections type \"1\" ylab False Discovery Rate \" col 4, 1wd 3 ) [ 1] 0. '\n",
            " '[12 ] <PHONE> <PHONE> <PHONE> <PHONE> <PHONE> [ 23] <PHONE> <PHONE> 2087 '\n",
            " '2159 we find that differences between centers are highly significant, as are '\n",
            " 'differences for times of day. there are two common ways to randomly split a '\n",
            " 'data set into equal- sized training and test sets')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('P514 12_ Unsupervised Learning C) 9 M L 1 2 ] 9 9 3 -2 ~1 2 1 0 8 L 2 3 2 8 '\n",
            " '10 12 True First Principal Component True PC Variances FIGURE 12.6 As '\n",
            " 'described in the tert, in each of 100 trials, we left out 20 elements of the '\n",
            " 'USArrests dataset. In each trial, we applied Algorithm 12.1 with M = 1 to '\n",
            " 'impute the missing elements and compute the principal components. 1725 6 '\n",
            " '0275 sample estimates : mean of the differences 3.1 However, we decided to '\n",
            " 'perform this test only after examining the data and noting that Managers One '\n",
            " 'and Two had the highest and lowest mean performances. In a sense, this means '\n",
            " 'that we have implicitly performed = 5(5 5 1)/2 10 hypothesis tests; rather '\n",
            " 'than just one, as discussed in Section 13.3.2. The key idea is that often a '\n",
            " 'small number of principal components suffice to explain most of the '\n",
            " 'variability in the data, as well6.3 Dimension Reduction Methods 257 8 8 2 8 '\n",
            " '8 8 1 Squared Bias Test MSE Variance 8 2 8 8 } 8 { 8 2 10 20 30 40 10 20 30 '\n",
            " '40 Number of Components Number of Components FIGURE 6.18. PCR was applied to '\n",
            " 'two simulated data sets. In each panel, the horizontal dashed line '\n",
            " 'represents the irreducible error_ Left: Simulated data from Figure 6.8. The '\n",
            " 'purple dashed line represents the Bayes decision boundary: The orange '\n",
            " 'background grid indicates the region in which a test observation will be '\n",
            " 'assigned to the orange class, and the blue background grid indicates the '\n",
            " 'region in which a test observation will be assigned to the blue class. '\n",
            " 'corresponds to predicting class one if Pr(Y = 1X = %0) 0.5, and class two '\n",
            " 'otherwise. 8 1 8 2 10 20 30 40 0.0 0.2 0.4 0.6 0.8 1.0 Number of Components '\n",
            " 'Shrinkage Factor FIGURE 6.19_ PCR, ridge regression, and the lasso were '\n",
            " 'applied to a simulated data set in which the first five principal components '\n",
            " 'of X contain all the informa- tion about the response Y_ In each panel, the '\n",
            " 'irreducible error Var(e) is shown as @ horizontal dashed line. Left: Results '\n",
            " 'for PCR_ Right: Results for lasso (solid) and ridge regression (dotted)...')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ppx_dPrQe-D1"
      },
      "source": [
        "## query list of terms"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gnkGwlBGazgG"
      },
      "source": [
        "### custom search function"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4kY-l4ZXaRnX",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "b686e6ec-6250-4c05-94e8-94ac32601cd5"
      },
      "source": [
        "from google.colab import files\n",
        "import pprint as pp\n",
        "def search_for_terms(srch_pipe, st_list, k_search=50, one_answer=True,\n",
        "                     answer_fmt=True, export_txt=True, no_context_in_txt=True,\n",
        "                     add_text=\"\", doc_ext=\".txt\"):\n",
        "    \"\"\"Run every search term through a pipeline, then print and log the results.\n",
        "\n",
        "    For each term in `st_list` the pipeline is queried; every returned answer\n",
        "    and its context are cleaned with `clean_output`, printed, appended as a\n",
        "    row to the global `info_queries`, and collected into a text document\n",
        "    that is written to disk and handed to `download_file` at the end.\n",
        "\n",
        "    Args:\n",
        "        srch_pipe: object whose `run(...)` returns a dict with an \"answers\"\n",
        "            list; each answer is indexed with 'answer' and 'context'.\n",
        "        st_list: list of search-term strings.\n",
        "        k_search: forwarded to the pipeline as `top_k_retriever`.\n",
        "        one_answer: forwarded as `generate_single_summary`.\n",
        "        answer_fmt: forwarded as `return_in_answer_format`.\n",
        "        export_txt: accepted for compatibility. NOTE(review): it is not\n",
        "            consulted - the text file is always written and downloaded.\n",
        "        no_context_in_txt: if True, the text file gets a placeholder instead\n",
        "            of the context; the row added to `info_queries` keeps the context.\n",
        "        add_text: extra text inserted into the output file name.\n",
        "        doc_ext: output file extension; use \".md\" to save as markdown.\n",
        "    \"\"\"\n",
        "    gc.collect()\n",
        "    st_document = []\n",
        "    global info_queries\n",
        "    spacer = \"\\n\\n~~~~~~~~~~~~~~~~~~\\n\"\n",
        "    # enumerate instead of st_list.index(): stays correct when a term is\n",
        "    # repeated in the list (numbering is still 0-based, as before)\n",
        "    for term_idx, search_term in enumerate(tqdm(st_list, total=len(st_list),\n",
        "                                   desc=\"getting defs for search_terms...\")):\n",
        "        print(spacer)\n",
        "\n",
        "        search_term_text = \"search_term {} of {}: {}\".format(term_idx,\n",
        "                                                             len(st_list),\n",
        "                                                             search_term)\n",
        "        st_document.append(spacer)\n",
        "        st_document.append(\"\\n### \" + search_term_text + \" \\n+++++++\\n\")\n",
        "        pp.pprint(search_term_text)\n",
        "        print(\"\\n\")\n",
        "        search_out = srch_pipe.run(query=search_term,\n",
        "                                   top_k_retriever=k_search,\n",
        "                                   generate_single_summary=one_answer,\n",
        "                                   return_in_answer_format=answer_fmt,\n",
        "                                   )\n",
        "        search_answers = search_out[\"answers\"]\n",
        "        print(\"\\n+++++++\\n\")\n",
        "        for count, memo in enumerate(search_answers, start=1):\n",
        "\n",
        "            this_answer = clean_output(memo['answer'])\n",
        "            this_context = clean_output(memo['context'])\n",
        "            # the total is taken from this query's own answers rather than\n",
        "            # from a name that is not defined in this function\n",
        "            this_answer_header = \"\\nmodel description: #{} of {}\\n\".format(\n",
        "                count, len(search_answers))\n",
        "            print(this_answer_header)\n",
        "            pp.pprint(this_answer)\n",
        "            this_context_header = \"\\n the context (first 2k chars) is: \\n\"\n",
        "            print(this_context_header)\n",
        "            pp.pprint(this_context[:2000]+\"...\")\n",
        "            # the placeholder goes only into the text export; the logged row\n",
        "            # below keeps the real context\n",
        "            if no_context_in_txt:\n",
        "                txt_context = \"see .xlsx for context\\n\"\n",
        "            else:\n",
        "                txt_context = this_context\n",
        "            st_document.extend([\"\\n\" + this_answer_header, this_answer,\n",
        "                                this_context_header, txt_context + \"\\n\"])\n",
        "\n",
        "            new_row_sum = {'query':search_term,\n",
        "                           'response':this_answer,\n",
        "                           'query_type':\"summary_search\",\n",
        "                           'doc_group':course_name,\n",
        "                           'model_name':sum_model,\n",
        "                           'context':this_context,\n",
        "                           }\n",
        "            # NOTE(review): if info_queries is a pandas DataFrame, .append was\n",
        "            # removed in pandas 2.0 - confirm the pandas version in use\n",
        "            info_queries = info_queries.append(new_row_sum,\n",
        "                                               ignore_index=True)\n",
        "\n",
        "\n",
        "    date_time = datetime.now().strftime(\"%m.%d.%Y, %H-%M\")\n",
        "    this_outname = remove_string_extras(\"SummarySearch-Terms_{}_exported_{}\".format(course_name, add_text) + sum_model + date_time) + doc_ext\n",
        "    with open(this_outname, 'w', encoding='utf-8', errors='ignore') as ss_f:\n",
        "        ss_f.writelines(st_document)\n",
        "    download_file(this_outname)\n",
        "\n",
        "    print(\"\\nCompleted Summary Search - \", date_time)"
      ],
      "execution_count": 50,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bh8Pjm1Td-TI",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "d8a2c7be-9b1e-4945-daf9-c3848cfe5601"
      },
      "source": [
        "# keep list structure, change terms as needed\n",
        "my_terms = [\n",
        "\n",
        "            'Regression',\n",
        "            'Model Selection',\n",
        "            'Optimization',\n",
        "            'Classification',\n",
        "            'Other Model Metrics',\n",
        "            'Kernels',\n",
        "            'Neural Networks',\n",
        "            'Clustering',\n",
        "            'Dimensionality Reduction',\n",
        "            'Decision Theory',\n",
        "            'Maximum Likelihood',\n",
        "            'Bootstrapping',\n",
        "            'Bayesian Viewpoint',\n",
        "            'Gaussian Mixture Models',\n",
        "            'Beans',\n",
        "            'Generative Adversarial Networks',\n",
        "]"
      ],
      "execution_count": 51,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LQNc__1AAm2I",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "da92b044-796a-4a21-c449-6735e02d2337"
      },
      "source": [
        "# some terms are commented out for the sake of the example\n",
        "# the more terms are searched, the more CUDA memory is used,\n",
        "# the more likely it is to crash and not finish.\n",
        "\n",
        "# use several small lists rather than one large one. this allows for \"checkpoints\"\n",
        "# (i.e. a text file is saved each time a list finishes)\n",
        "\n",
        "PR_ML = [\n",
        "                \n",
        "            'gaussian distribution',\n",
        "            'log likelihood function',\n",
        "            'linear regression model',\n",
        "            'graphical models',\n",
        "            'latent variables',\n",
        "            'conditional mixture models',\n",
        "            'mixture density network',\n",
        "            'log marginal likelihood',\n",
        "            'posterior probability',\n",
        "            # 'relevance vector machine',\n",
        "            # 'directed graph',\n",
        "            # 'bayesian model',\n",
        "            # 'multivariate gaussian',\n",
        "            # 'synthetic data points',\n",
        "            # 'probabilistic pca',\n",
        "            # 'sampling',\n",
        "            # 'markov chain',\n",
        "            # 'output unit activation',\n",
        "\n",
        "]\n",
        "\n",
        "past_exams = [\n",
        "              \n",
        "    'confusion matrix tolerance',\n",
        "    'empirical risk minimizer',\n",
        "    # 'neural network architecture',\n",
        "    # 'mle probability',\n",
        "    # 'basis function kernel',\n",
        "    # 'pca objective',\n",
        "    # 'activation functions',\n",
        "    # 'outputs valid class',\n",
        "    'component analysis pca',\n",
        "    'likelihood estimate mle',\n",
        "    'pca',\n",
        "    'weighted empirical risk',\n",
        "    'convergence of algorithm',\n",
        "]\n",
        "\n",
        "# if more lists added, ensure that the below is updated (see other comment)"
      ],
      "execution_count": 52,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IbopeVKyqfw-"
      },
      "source": [
        "## Run List of Search Terms\n",
        "\n",
        "- Depending on the database size, the number of questions, and so on, you may need to run the QA list and the search term list separately."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KbFUqxvb89q8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "cellView": "form",
        "outputId": "757c752c-ac56-4fdc-d319-a4b5e3f81e0d"
      },
      "source": [
        "run_search_term_list = True #@param {type:\"boolean\"}\n",
        "num_docs_pl =  50#@param {type:\"integer\"}"
      ],
      "execution_count": 53,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XpHGprmge-8Y",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000,
          "referenced_widgets": [
            "e0cab22ea8614f858bba3157f5284929",
            "f3adb2e4e92941b5a63914f733de52c8",
            "344b107cd6004274945f5e9fb2db8d37",
            "9299178358ed4ae580fd75fe27e10eeb",
            "5d073a3773e64b7b8153281784ab7d4a",
            "79ada5619ae549dc886074c5d0b079ba",
            "1d62d04c488a4aceb3597550ccd606fc",
            "9e4c97eb18c44e7cad98f19be5722bf1",
            "ed4425bde1b74eb2972526e0d00fdadf",
            "59e7ff0a4102444c858235995750a252",
            "4861e090d5124fe7a19e9d0ccd7e669e"
          ]
        },
        "outputId": "9c3097f4-50ae-43de-9ac4-2227c28e45dc"
      },
      "source": [
        "if run_search_term_list:\n",
        "    # iterates through a primary list of terms in `my_terms`\n",
        "    search_for_terms(sumsearch_pipe, my_terms,\n",
        "                     k_search=num_docs_pl, # decrease_if_crash\n",
        "                     add_text=\"_main_{}_\".format(questions_version),\n",
        "                     export_txt=False)"
      ],
      "execution_count": 54,
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "e0cab22ea8614f858bba3157f5284929",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "getting defs for search_terms...:   0%|          | 0/16 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 0 of 16: Regression'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in this chapter we discuss linear and nonparametric regression for a variety '\n",
            " 'of situations. the purpose is to provide an introduction to these topics '\n",
            " 'which will be useful for those who are interested in learning more about '\n",
            " 'statistical inference, or want to apply them to their own problems. + * key '\n",
            " 'words : * machine learning, regression, multiple output, selection, '\n",
            " 'shrinkage, logistic regression, bayesian regressions, randomness, monte - '\n",
            " 'carlo ( mccarthy ) method, parametric assumptions, idealization, '\n",
            " 'bibliographic Remarks, kirkpatrick school of mathematical sciences, lawrence '\n",
            " 'berkeley institute of technology, university of california, santa barbara, '\n",
            " 'ca 93106 - 0.5 in _ pacs number(s ) : 05.40.-a, 89.65.+e, 02.30.jv, '\n",
            " '07.05.fh, 87.10.gx,04.80.pq # 1')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('In Chapters 20 and 21 we discuss nonparametric regression: 13.1 Simple '\n",
            " 'Linear Regression The simplest version of regression is when Xi is simple '\n",
            " '(one-dimensional) and r(x) is assumed to be linear: r(x) = Bo + B1x. 1 The '\n",
            " \"term 'regression' is due to Sir Francis Galton (<PHONE>) who noticed that \"\n",
            " 'tall and short men tend to have sons with heights closer to the mean_ He '\n",
            " 'called this \"regression towards the mean. 210 13. Linear and Logistic '\n",
            " 'Regression 9 4 7 L 2 1 Y 2 7 8 4.0 4.5 5.0 5.5 log light intensity (X) '\n",
            " 'FIGURE 13.1. 2001, p70). The reason is that in PC regression, only the first '\n",
            " 'K (derived) dimensions are retained, and the remaining D K dimensions are '\n",
            " 'entirely ignored. Figure &.1 shows the data and a histogram of the bootstrap '\n",
            " 'replications 0f_ B This histogram is an approximation to the sampling '\n",
            " 'distribution of 0. The Normal-based 95 percent confidence interval is 78 + '\n",
            " '2se (.51,1.00) while the percentile interval is (.46,.96) In large samples, '\n",
            " 'the two methods will show closer agreement. 8.7 Example_ This example is '\n",
            " 'from Efron and Tibshirani (1993). When drug companies introduce new '\n",
            " 'medications, they are sometimes required to show bioequivalence. 84 3 Linear '\n",
            " 'Methods for Regression 3.7 Multiple Outcome Shrinkage and Selection As noted '\n",
            " 'in Section 3.2.4, the least squares estimates in a multiple-output linear '\n",
            " 'model are simply the individual least squares estimates for each of the '\n",
            " 'outputs. To apply selection and shrinkage methods in the multiple output '\n",
            " 'case one could apply a univariate technique individually to each outcome Or '\n",
            " 'si- multaneously to all outcomes. (2007) _ 3.5 Methods Using Derived Input '\n",
            " 'Directions In many situations we have a large number of inputs; often very '\n",
            " 'correlated: The methods in this section produce a small number of linear '\n",
            " 'combinations Zm, m = 1 M of the original inputs Xj; and the Zm are then used '\n",
            " 'in place of the Xj as inputs in the regression. Also on the plot is an '\n",
            " 'estimated linear regression line which will be explained shortly: The '\n",
            " 'unknown para...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 1 of 16: Model Selection'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('cross-validation the wrong and right way : histograms shows the correlation '\n",
            " 'of class labels, in 10 randomly chosen samples, with the 100 predic- tors '\n",
            " 'chosen using the incorrect ( upper red) and correct ( lower green) versions '\n",
            " 'of cross - validation. this is supported by an extensive empirical '\n",
            " 'comparison of 10 different classifiers in (Caruana and Niculescu-Mizil 2006 '\n",
            " '), who showed that boosted decision trees were the best both in terms of '\n",
            " 'misclassification error and in producing well-calibrated probabilities, as '\n",
            " 'judged by rOC curves. (The second best method was random forests, invented '\n",
            " 'by Breiman ; see section 16.2.5.) in each regression, the principal '\n",
            " 'component score vectors are the predictors, and one of the features of a '\n",
            " 'data matrix is the response. let <CUR> = {S1,. = Sm} denote a set of models. '\n",
            " 'suppose we assign the prior P( S;) 1/m over the models.')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('[This question was suggested by Li Ma:] 10-e 260 Model Assessment and '\n",
            " 'Selection This is supported by an extensive empirical comparison of 10 '\n",
            " 'different classifiers in (Caruana and Niculescu-Mizil 2006), who showed that '\n",
            " 'boosted decision trees were the best both in terms of misclassification '\n",
            " 'error and in terms of producing well-calibrated probabilities, as judged by '\n",
            " 'ROC curves. (The second best method was random forests, invented by Breiman; '\n",
            " 'see Section 16.2.5.) In each regression, the principal component score '\n",
            " 'vectors are the predictors, and one of the features of the data matrix is '\n",
            " 'the response. (13.31) 2 3Some texts use a slightly different definition of '\n",
            " 'AIC which involves multiplying the definition here by 2 or -2 This has no '\n",
            " 'effect on which model is selected 13.6 Model Selection 221 The BIC score has '\n",
            " 'a Bayesian interpretation. Let <CUR> = {S1,. = Sm} denote a set of models. '\n",
            " 'Suppose we assign the prior P( S;) 1/m over the models. Choosing these '\n",
            " 'parameters is an example of model selection: We discuss some approaches '\n",
            " 'below. 11.5.1 Model selection for probabilistic models The optimal Bayesian '\n",
            " 'approach, discussed in Section 5.3, is to pick the model with the largest '\n",
            " 'marginal likelihood, K 3 argmaxk P(DIK) There are two problems with this. '\n",
            " 'First, evaluating the marginal likelihood for LVMs is quite difficult In '\n",
            " 'practice, simple approximations, such as BIC, can be used (see e.g;, (Fraley '\n",
            " 'and Raftery 2002)). 2 For each fold k = 1,2, K 246 Model Assessment and '\n",
            " 'Selection Wrong way 8 8 [ 1 9 0 -1.0 -0.5 0.0 0.5 1.0 Correlations of '\n",
            " 'Selected Predictors with Outcome Right way 8 R [ 9 -1.0 -0.5 0.0 0.5 1.0 '\n",
            " 'Correlations of Selected Predictors with Outcome FIGURE 7.10. '\n",
            " 'Cross-validation the wrong and right way: histograms shows the correlation '\n",
            " 'of class labels, in 10 randomly chosen samples, with the 100 predic- tors '\n",
            " 'chosen using the incorrect (upper red) and correct (lower green) versions of '\n",
            " 'cross-validation. For the curves labeled \"Sample = 0.5 @ different 50 % '\n",
            " 'subsample of the tr...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 2 of 16: Optimization'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('this chapter describes the basic numerical methods for training ma- chine '\n",
            " 'learning models Training a machine learning model often boils down to '\n",
            " 'finding a good set of parameters : the notion of \"good\" is de- ( termined by '\n",
            " 'the probabilistic model, which we will see examples of in the second part of '\n",
            " 'this book given an objective function, finding the best value is done using '\n",
            " 'optimization algorithms. continuous optimization stepsize unconstrained '\n",
            " 'optimization Gradient descent Momentum stochastic gradient descent convex '\n",
            " 'optimization duality linear programming convex conjugate quadratic '\n",
            " 'programming classification since this is a cubic equation, it has in general '\n",
            " 'three solutions when set to zero. @xmath0 <CUR> ] # 1#2#3#4#1 * # 2 *, # 3 ( '\n",
            " '# 4 ) # 5(#6)#7[#8]#9#10#11#12#13#14#15#16#17#18#19#')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('In machine learning, optimization methods are used for training by min- '\n",
            " 'imizing an objective function 0n the training data, but the overall goal is '\n",
            " 'to improve generalization performance (Chapter 8). Since the goal in machine '\n",
            " 'learning does not necessarily need a precise estimate of the min- imum of '\n",
            " 'the objective function, approximate gradients using mini-batch approaches '\n",
            " 'have been widely used, Stochastic gradient descent is very effective in '\n",
            " 'large-scale machine learning problems (Bottou et al,, 2018), Draft '\n",
            " '(2021-01-14) of \"Mathematics for Machine Learning\". Feedback: https '\n",
            " '/mml-book. Continuous Optimization Since machine learning algorithms are '\n",
            " 'implemented on a computer; the mathematical formulations are expressed as '\n",
            " 'numerical optimization meth- ods. This chapter describes the basic numerical '\n",
            " 'methods for training ma- chine learning models Training a machine learning '\n",
            " 'model often boils down to finding a good set of parameters: The notion of '\n",
            " '\"good\" is de- ( termined by the objective function or the probabilistic '\n",
            " 'model, which we will see examples of in the second part of this book Given '\n",
            " 'an objective function, finding the best value is done using optimization '\n",
            " 'algorithms. 226 Continuous Optimization Figure 7.1 A mind map of the '\n",
            " 'concepts related to optimization, as presented in this chapter: There are '\n",
            " 'two main ideas: gradient descent and convex optimization: Continuous '\n",
            " 'optimization Stepsize Unconstrained optimization Gradient descent Momentum '\n",
            " 'Stochastic gradient descent Constrained optimization Chapter 10 Dimension '\n",
            " 'reduc Lagrange multipliers Chapter 11 Density estimation Convex Convex '\n",
            " 'optimization duality Linear programming Convex conjugate Quadratic '\n",
            " 'programming Chapter 12 Classification Since this is a cubic equation, it has '\n",
            " 'in general three solutions when set to zero. In the example, two of them are '\n",
            " 'minimums and one is a maximum (around I = -1.4). 2005; Deisenroth et al. '\n",
            " '2009), global optimization of non-convex functions (Mockus et al 1996; '\n",
            " 'Lizotte 2008; Brochu et al. 20...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 3 of 16: Classification'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('fkj(xj ) is a classification method that partition the covariate space into '\n",
            " 'disjoint pieces and then classify the observations according to which '\n",
            " 'partition element they fall in : as the name implies, the classifier can be '\n",
            " 'represented as a tree for illustration, suppose there are two covariates, x1 '\n",
            " 'age and x2 blood pressure. the tree is used in the following way. if we only '\n",
            " 'had data on X and y we would conclude that x and Y are associated. next we '\n",
            " 'might encourage everyone to take vitamin c. Suppose we wrongly interpret '\n",
            " 'this causally and concluded that vitamin C prevents illness. 22.7 trees '\n",
            " 'Trees are classification methods that partitions the covariance space ( x ) '\n",
            " 'into three or more dis- joint pieces such that each observation falls in one '\n",
            " 'of these parts. when x is high-dimensional and dis - crete_ In that case, it '\n",
            " 'is especially simple to use the naive bayes classifiers ; e.g. @xmath0')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('For example, in the animal dataset; we may want to group the animals on the '\n",
            " 'basis of anatomical features (e.g; mammals are warm blooded; reptiles are '\n",
            " 'not), Or on the basis of behavioral features (e.g;, predators VS prey): We '\n",
            " 'now present a model that can capture this phenomenon: This model was '\n",
            " 'indepen- dently proposed in (Shafto et al. 2006; Mansinghka et al 2011), who '\n",
            " 'call it crosscat (for cross- categorization), and in (Guan et al. 2010; Cui '\n",
            " 'et al. 354 22 Classification simplification occurs if we assume that Zo = Zo '\n",
            " '= Z. Coefficient Std: error 0.18 0.36 0.92 0.64 2.15 0.45 0.89 0.66 0.44 '\n",
            " '0.70 -0.05 0.02 0.03 0.02 0.18 0.60 z-statistic p-value 0.51 0.61 1.43 0.15 '\n",
            " '4.78 0.00 1.35 0.18 0.63 0.53 -3.00 <0.01 1.54 0.12 0.30 0.77 sex [Male] '\n",
            " 'diagnosis [LG Glioma] diagnosis [HG Glioma] diagnosis [Other] loc '\n",
            " \"[Supratentorial] ki gtv stereo [SRT] TABLE 11.2. Results for Cox'<CUR> \"\n",
            " 'proportional hazards model fit to the BrainCancer data; which WaS first '\n",
            " 'described in Section 11.3. The variable diagnosis is qualitative with four '\n",
            " 'levels: meningioma, LG glioma, HG glioma, or other. The naive Bayes '\n",
            " 'classifier is popular when x is high-dimensional and dis- crete_ In that '\n",
            " 'case, fkj(xj) is especially simple. 22.7 Trees Trees are classification '\n",
            " 'methods that partition the covariate space X into disjoint pieces and then '\n",
            " 'classify the observations according to which partition element they fall in: '\n",
            " 'As the name implies, the classifier can be represented as a tree For '\n",
            " 'illustration, suppose there are two covariates, X1 age and X2 blood pressure '\n",
            " 'Figure 22.2 shows a classification tree using these variables. The tree is '\n",
            " 'used in the following way. If we only had data on X and Y we would conclude '\n",
            " 'that X and Y are associated. Suppose we wrongly interpret this causally and '\n",
            " 'conclude that vitamin C prevents illness. Next we might encourage everyone '\n",
            " 'to take vitamin C. If most people comply with our advice, the population '\n",
            " 'will look something like this: X Y Co C1 0* G 0* 0* 1 1* 17 ; 1 1 Now a '\n",
            " '(4/7) ~ (0/1...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 4 of 16: Other Model Metrics'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('factor analysis is a classical technique developed in the statistical liter- '\n",
            " 'ature that aims to identify these latent sources. Factor analysis models are '\n",
            " 'typically wed to gaussian distributions ; which has to some extent hin- '\n",
            " 'dered their usefulness. however, for many adaptive; nonlinear techniques '\n",
            " '(like trees ), estimation of the effective number of parameters is very '\n",
            " 'difficult_ this makes methods like tree-based models impractical and leaves '\n",
            " 'us with cross-validation or bootstrap as the methods of choice. subsection '\n",
            " '14.12.5 gives a relatively simple bound for the entropy involving a '\n",
            " 'logarithmic term, but that one then possibly pays a price in terms of '\n",
            " 'constants : @xmath0 <CUR> ] = -1/1 y _ f(x)<CUR> ] # 1#2#3#4#1 * # 2 *, # 3 '\n",
            " '( # 4 ) [ # 5]#6#7#8#9#10#11#12#13#14#')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('The second term is the average code length for transmitting the model '\n",
            " 'parameters 0 while the first term is the average code length for '\n",
            " 'transmitting the discrepancy between the model and actual target values. '\n",
            " 'Quantifying uncertainty requires the idea of a random variable, which is a '\n",
            " 'function that maps outcomes of random experiments to a set of properties '\n",
            " 'that we are interested in. Associated with the random variable is a function '\n",
            " 'that measures the probability that a particular outcome (or set of outcomes) '\n",
            " 'will occur; this is called the probability distribution. Probability '\n",
            " 'distributions are used as a building block for other con- cepts, such as '\n",
            " 'probabilistic modeling (Section 8.4) , graphical models (Sec- tion 8.5) , '\n",
            " 'and model selection (Section 8.6). Although this is an interesting topic; it '\n",
            " 'is beyond the scope of this book: 2.1.5 Regression Versus Classification '\n",
            " 'Problems Variables can be characterized as either quantitative or '\n",
            " 'qualitative (also quantitative known as categorical)_ Quantitative variables '\n",
            " \"take on numerical values qualitative Examples include a person's age, \"\n",
            " 'height, Or income, the value of a house, categorical and the price of a '\n",
            " 'stock. In contrast, qualitative variables take on values in one of K '\n",
            " 'different classes; or categories. The rows in the figure show the resulting '\n",
            " 'archetypes from three runs, specifying two, three and four archetypes, '\n",
            " 'respectively: fluences, and other driving forces that may be hard to '\n",
            " 'identify or measure. Factor analysis is a classical technique developed in '\n",
            " 'the statistical liter- ature that aims to identify these latent sources. '\n",
            " 'Factor analysis models are typically wed to Gaussian distributions; which '\n",
            " 'has to some extent hin- dered their usefulness. Robust Loss Functions for '\n",
            " 'Classification Although both the exponential (10.8) and binomial deviance '\n",
            " '(10.18) yield the same solution when applied to the population joint '\n",
            " 'distribution, the same is not true for finite data sets. Both criteria are '\n",
            " 'monotone decreasing functions of the \"ma...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 5 of 16: Kernels'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('support-vector machines ( SVMs ) can be made into a multiclass classifier ; '\n",
            " 'but there are various difficulties with this approach, especially in the '\n",
            " 'multi-class classification setting. it is possible to obtain sparse, '\n",
            " 'probabilistic, multi - class kernel-based classifiers, which work as well or '\n",
            " 'better than SVm, using techniques such as the lIVM or the russian process '\n",
            " 'machine ( rVM ). 14.2 Kernel functions : we define a kernel function to be a '\n",
            " \"real-valued function of two arguments, k(x,x') and be computed in o(lxl lx' \"\n",
            " '| ) time (for certain settings of the weights {ws}) using suffix trees '\n",
            " '(Leslie et al.2003; Vishwanathan and Smola 2003; Shawe-Taylor and '\n",
            " 'Cristianini 2004). 13.7 kernels for building generative models There is a '\n",
            " 'different kind of kernel known as a smoothing kernel which can also be used '\n",
            " 'to create non')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('The kernel trick 495 Algorithm 14.2: Kernel PCA 1 Input: K of size N X N, K* '\n",
            " 'of size N* X N, num. 14.7 Kernels for building generative models There is a '\n",
            " 'different kind of kernel known as a smoothing kernel which can be used to '\n",
            " 'create non-parametric density estimates. We define the kernel between two '\n",
            " \"strings % and &' as K(E , %' ) = Wsds(x)ds(& s<CUR>A (14.22) where ws > 0 \"\n",
            " 'and A* is the set of all strings (of any length) from the alphabet A (this '\n",
            " 'is known as the Kleene star operator) This is a Mercer kernel, and be '\n",
            " \"computed in O(lxl lx' | ) time (for certain settings of the weights {ws}) \"\n",
            " 'using suffix trees (Leslie et al. 2003; Vishwanathan and Smola 2003; '\n",
            " 'Shawe-Taylor and Cristianini 2004). There are various cases of interest. An '\n",
            " 'example of a kernel that is not a Mercer kernel is the so-called sigmoid '\n",
            " 'kernel; defined by tanh(rx T_ n(x,x) = x +r) (14.17) (Note that this uses '\n",
            " 'the tanh function even though it is called a sigmoid kernel:) This kernel '\n",
            " 'was inspired by the multi-layer perceptron (see Section 16.5), but there is '\n",
            " 'no real reason to use it. (For a true \"neural net kernel\" which is positive '\n",
            " 'definite, see Section 15.4.5.) In general, establishing that a kernel is a '\n",
            " 'Mercer kernel is difficult; and requires techniques from functional '\n",
            " 'analysis. Exponentially sized dimensions representing all monies and so all '\n",
            " 'products of up to the van Riables and from original, original do mean are so '\n",
            " 'those we already have seen and in both of these cases, these features as '\n",
            " \"space is actually five dimensional, but as others where it's not and so one \"\n",
            " 'may be more widely used kernel in practice is called the Gaussian colonel '\n",
            " 'colonel. 2009) for more information: 14.5.5 A probabilistic interpretation '\n",
            " 'of SVMs In Section 14.3, we saw how to use kernels inside GLMs to derive '\n",
            " 'probabilistic classifiers, such as the LIVM and RVM And in Section 15.3, we '\n",
            " 'will discuss Gaussian process classifiers, which also use kernels. The SVM '\n",
            " 'can be made into a multiclass classifier; but there are various diffic...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 6 of 16: Neural Networks'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('a special family of convolutional neural networks (CNNs ) has evolved for '\n",
            " 'classifying images such as these, and has shown spectacular success on a '\n",
            " 'neural wide range of problems. when used at full strength they can be quite '\n",
            " 'com- plex we illustrate their use in two simple applications. there is a '\n",
            " 'designated training set of 50,000 images to be learned from ; and a test set '\n",
            " 'that is designed to demonstrate the power of recurrent neural network '\n",
            " 'techniques. with this feedback, participants were then asked to submit '\n",
            " 'predictions for a separate test - set and they received their results_ '\n",
            " 'Finally; the class labels for the validation set were released and '\n",
            " 'participants had one week to train their algorithms on the combined training '\n",
            " 'and validation sets, and submit their final pre dictions to the competition '\n",
            " 'website @xmath0 <CUR> ] [1] 0 4698 npred) ^2) 1 Vo 10.10.5 Recurrent Neural '\n",
            " 'Networks 425 8 2 @ 8 3 9 2 2 9 & # 1 g 3')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('There was a lot of excite- network ment and a certain amount of hype '\n",
            " 'associated with this approach; and they were the impetus for the popular '\n",
            " 'Neural Information Processing Systems meetings (NeuIPS, formerly NIPS) held '\n",
            " 'every year , typically in exotic places like ski resorts. This was followed '\n",
            " 'by a synthesis stage, where the properties of neural networks were analyzed '\n",
            " 'by machine learners, math- ematicians and statisticians; algorithms were '\n",
            " 'improved, and the method- ology stabilized. Since 08,00,s (v) = Bo(ao + sv) '\n",
            " 'has lower complexity than a more general nonparametric g(v) , it is not '\n",
            " 'surprising that a neural network might use 20 or 100 such functions, while '\n",
            " 'the PPR model typically uses fewer terms ( M = 5 or 10, for example). '\n",
            " 'Finally; we note that the name \"neural networks\" derives from the fact that '\n",
            " 'they were first developed as models for the human brain: Each unit '\n",
            " 'represents a neuron, and the connections (links in Figure 11.2) represent '\n",
            " 'synapses. Furthermore, for some learning tasks the response is also a '\n",
            " 'sequence, and s0 the output sequence {01, 02, OL} is explicitly needed. When '\n",
            " 'used at full strength; recurrent neural networks can be quite com- plex We '\n",
            " 'illustrate their use in two simple applications. The numbers for each image '\n",
            " 'are organized in a three-dimensional array called a feature map. The first '\n",
            " 'two feature map axes are spatial (both are 32-dimensional) , and the third '\n",
            " 'is the channel 6 channel axis; representing the three colors. There is a '\n",
            " 'designated training set of 50,000 images, and a test set of 10,000. A '\n",
            " 'special family of convolutional neural networks (CNNs) has evolved for '\n",
            " 'convolutional classifying images such as these, and has shown spectacular '\n",
            " 'success on a neural wide range of problems. Huber (1985) gives a scholarly '\n",
            " 'overview, and Roosen and Hastie (1994) present a formulation using smoothing '\n",
            " 'splines: The motivation for neural networks dates back to McCulloch and '\n",
            " 'Pitts (1943) , Widrow and Hoff (1960) (reprinted in An- derson and Rosenfeld '\n",
            " '(1988)) an...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 7 of 16: Clustering'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in this chapter, we give a pedagogical introduction to the concepts of '\n",
            " 'clustering, biclustering and clustering. we discuss the issue of how to '\n",
            " 'choose the heightl number of clusters below : if we cut the tree at any '\n",
            " 'given height ; we induce a clustering of a given size. (a) bayesian method '\n",
            " '(b) average linkage: we show the 3 most probable words per cluster: (c) nGnh '\n",
            " 'ieG i<CUR>H where ng and nH are the number elements in groups g and h. see '\n",
            " 'Figure 5 of (Heller and Ghahramani 2005 ) for details. using a priority '\n",
            " 'queue, this can be reduced to o(N2 1og N) (see eg,, (Manning et al 2008, ch: '\n",
            " \"17)for details). concerning the data is standardized, then corr [xi, Xi'] lj \"\n",
            " \"Cijli' j' and hence c;\")\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('(a) Bayesian method (b) Average linkage: We show the 3 most probable words '\n",
            " 'per cluster: The number of documents at each cluster is also given: Source: '\n",
            " 'Figure 5 of (Heller and Ghahramani 2005). Used with kind permission of '\n",
            " 'Katherine Heller: 25.6.1 Biclustering Clustering the rows and columns is '\n",
            " 'known as biclustering o coclustering: This is widely used in bioinformatics, '\n",
            " 'where the rows often represent genes and the columns represent conditions. '\n",
            " '(1 - Cv) 1=1 1=1 (25.23) Tk (25.24) 884 Chapter 25. Clustering This is often '\n",
            " 'denoted by TT N GEM(a) (25.25) where GEM stands for Griffiths, Engen and '\n",
            " 'McCloskey (this term is due to (Ewens 1990)). Some samples from this process '\n",
            " 'are shown in Figure 25.5. This is called the Chinese restaurant process or '\n",
            " 'CRP, based on the seemingly infinite supply of tables at certain Chinese '\n",
            " 'restaurants. The analogy is as follows: The tables are like clusters, and '\n",
            " 'the customers are like observations. When a person enters the restaurant; he '\n",
            " 'may choose to join an existing table with probability proportional to the '\n",
            " 'number of people already sitting at this table (the Nk); otherwise, with a '\n",
            " 'probability that diminishes as more people enter the room (due to the 1/(a + '\n",
            " 'N) term), 25.2. Complete linkage represents the opposite extreme: two groups '\n",
            " 'are considered close only if all of the observations in their union are '\n",
            " 'relatively similar: This will tend to produce clusterings with small '\n",
            " 'diameter, ie , compact clusters. 25.5.1.3 Average link In practice, the '\n",
            " 'preferred method is average link clustering, which measures the average '\n",
            " \"distance between all pairs: davg ' (G, H) = di,i' (25.51) nGnH ieG i<CUR>H \"\n",
            " 'where nG and nH are the number of elements in groups G and H. See Figure '\n",
            " '25.14(c). For this data, complete and average linkage generally separate the '\n",
            " 'observations into their correct groups. However, single linkage identifies '\n",
            " 'one point as belonging to its own cluster. A more sensible answer is '\n",
            " 'obtained when four clusters are selected; although there are still two...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 8 of 16: Dimensionality Reduction'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the curse of dimensionality plays a key role in high-dimensional settings. '\n",
            " 'we examine several manifestations of this phenomenon, and highlight three '\n",
            " 'important points : (1) regularization or shrinkage plays an important role ; '\n",
            " '(2) appropriate tuning parameter selection is crucial for good predictive '\n",
            " 'performance ; and (3) the test error tends to increase as the dimensional of '\n",
            " \"the problem ( i.e. the '' curse '' ) increases. 1.3.2 Discovering latent \"\n",
            " 'factors when dealing with high dimensional data, it is often useful to '\n",
            " 'reduce the dimensions by projecting the data to a lower dimensional subspace '\n",
            " 'which captures the essence\" of that data; this is called dimensionally '\n",
            " 'reduction. 2.8 Fraction of data in neighborhood (a) (b) Figure 10.1 gives an '\n",
            " 'illustrative example in two dimensions. Dimensionality reduction generally '\n",
            " 'exploits a property of high dimen- sional data ( e.g-, images ) that it '\n",
            " 'often lies on a low-dimension subspaces')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Each hidden unit is obtained by convolving with the appropriate filter; and '\n",
            " 'then summing over the input planes The final layer is obtained by computing '\n",
            " 'the local maximum within a small window. Source: Figure 1 of (Chen et al. '\n",
            " '2010) Used with kind permission of Bo Chen: faces, cars, airplanes, '\n",
            " 'motorbikes (a) (b) Figure 28.9 Visualization of the filters learned by a '\n",
            " 'convolutional DBN in layers two and three. Source: Figure 3 of (Lee et al. '\n",
            " '2009). 2011, Theorem 14). (The troublesome cases arise when there are '\n",
            " 'fractional assignments with the same optimal value as the MAP estimate.) The '\n",
            " 'blue lines track the outer quantiles of the residuals, and emphasize '\n",
            " 'patterns. Left: The funnel shape indicates heteroscedasticity: Right: The '\n",
            " 'response has been log transformed, and there is now no evidence of '\n",
            " 'heteroscedasticity: terms can also occur outside of time series data_ For '\n",
            " \"instance, consider a study in which individuals' heights are predicted from \"\n",
            " 'their weights. The assumption of uncorrelated errors could be violated if '\n",
            " 'some of the indi- viduals in the study are members of the same family; eat '\n",
            " 'the same diet, or have been exposed to the same environmental factors. The '\n",
            " 'poor performance in high dimensional settings is due to the curse of '\n",
            " 'dimensionality: To explain the curse, we give some examples from (Hastie et '\n",
            " 'al. 2009, p22). It would seem that with a reasonably large set of training '\n",
            " 'data, we could always approximate the theoretically optimal conditional '\n",
            " 'expectation by k-nearest-neighbor averaging; since we should be able to find '\n",
            " 'a fairly large neighborhood of observations close to any 1 and average them. '\n",
            " 'This approach and our intuition breaks down in high dimensions, and the '\n",
            " 'phenomenon is commonly referred to as the curse of dimensionality (Bellman, '\n",
            " '1961). There are many manifestations of this problem, and we will examine a '\n",
            " 'few here. 2007b) because of the use of the <CUR>1-penalty Typically, it '\n",
            " \"shrinks some of the non- diagonal elements exactly to zero, i.e. 'j,k (1) 0 \"\n",
            " 'for some ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 9 of 16: Decision Theory'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in decision theory, the regression function y(x)_ which minimizes the '\n",
            " 'expected squared loss, is given by the mean of the conditional distri- '\n",
            " 'bution p(tlxo) x0 which is the average of t conditioned on x. a variant f '\n",
            " 'this quantity, called the bayesian information criterion, or bIC, will be '\n",
            " 'discussed in section 4.4.1.2. 6 Models; Statistical Inference and Learning '\n",
            " '6.6 models ; probabilistic models 5.3 models for which the quan- tity m ( '\n",
            " '1.73 ) is largest : here @p(dlwML)__is the best - fit log likelihood, and '\n",
            " '@@x denotes the number of adjustable parameters in the model.7 models that '\n",
            " 'satisfy weak regularity conditions, the maximum likelihood estimator is '\n",
            " 'approximately minimax: 1 \"Well-behaved\\' means that the level sets must be '\n",
            " 'convex and symmetric about the origin. 11.')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('This provided the first rigorous proof that probability theory could be '\n",
            " 'regarded as an extension of Boolean logic to situations involving '\n",
            " 'uncertainty (Jaynes, 2003) Numerous other authors have proposed different '\n",
            " 'sets of properties or axioms that such measures of uncertainty should '\n",
            " 'satisfy (Ramsey, 1931; Good, 1950; Savage, 1961; deFinetti, 1970; Lindley, '\n",
            " '1982). For example, the Akaike information criterion, or AIC (Akaike, 1974), '\n",
            " 'chooses the model for which the quan- tity In p(DlwML) _ M (1.73) is '\n",
            " 'largest: Here p(DlwML) is the best-fit log likelihood, and M is the number '\n",
            " 'of adjustable parameters in the model. A variant f this quantity, called the '\n",
            " 'Bayesian information criterion, Or BIC, will be discussed in Section 4.4.1. '\n",
            " 'To combine prior beliefs with data in a principled way; use Bayesian in- '\n",
            " 'ference. To construct procedures with guaranteed long run performance, such '\n",
            " 'as confidence intervals; use frequentist methods. Generally; Bayesian '\n",
            " 'methods run into problems when the parameter space is high dimensional. In '\n",
            " 'particu- lar , 95 percent posterior intervals need not contain the true '\n",
            " 'value 95 percent of the time (in the frequency sense) - 11.10 Bibliographic '\n",
            " 'Remarks Some references on Bayesian inference include Carlin and Louis '\n",
            " '(1996) , Gel- man et al. (1995) , Lee (1997) , Robert (1994) , and Schervish '\n",
            " '(1995) . Decision Theory 47 Figure 1.28 The regression function y(x)_ which '\n",
            " 'minimizes the expected squared loss, is given by the mean of the conditional '\n",
            " 'distri- bution p(tlx) . y(x) y(x0) p(tlxo) x0 which is the conditional '\n",
            " 'average of t conditioned on x and is known as the regression function_ This '\n",
            " 'result is illustrated in Figure 1.28. 5.7. Bayesian decision theory 177 Our '\n",
            " 'goal is to devise a decision procedure or policy, <CUR> : x - A which '\n",
            " 'specifies the optimal action for each possible input: By optimal, we mean '\n",
            " 'the action that minimizes the expected loss: 6(x) 3 argminE [L(y , a)] aeA '\n",
            " '(5.96) In economics, it is more common to talk of a utility function; this '\n",
            " 'is just n...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 10 of 16: Maximum Likelihood'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('for parametric models that satisfy weak regularity conditions, the maximum '\n",
            " 'likelihood estimator is approximately minimax Consider squared error loss '\n",
            " 'which is squared bias plus variance. : the presented rate is best for 8 0 '\n",
            " \"close to zero resulting in 4-6' log pn_ log_ (pn_ b'l1 n n Op max for any 0 \"\n",
            " \"8' ( < 5/44) ( and the upper bounds for & and &' are not of special inter- \"\n",
            " 'est ). such a consistency result for prediction also holds for the lasso, as '\n",
            " 'described in formula (2.7) in chapter 2 or with more details given in '\n",
            " 'corollary 6.1 in section @xmath0 1.5 Maximum Likelihood, Minimax, and Bayes '\n",
            " '201 12.10 10.4 9.8 3.3 Conditional probability we define the conditional '\n",
            " 'probability of event A, given that event B is true ; if p(a,b)p(AIB)')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('(11.16) is known as the partial likelihood because it is not exactly a '\n",
            " 'likelihood. Hence; Theorem 12.10 implies that 0 is minimax: 1 '\n",
            " '\"Well-behaved\\' means that the level sets must be convex and symmetric about '\n",
            " 'the origin. The result holds up to sets of measure 0_ 12.5 Maximum '\n",
            " 'Likelihood, Minimax, and Bayes 201 12.5 Maximum Likelihood; Minimax, and '\n",
            " 'Bayes For parametric models that satisfy weak regularity conditions, the '\n",
            " 'maximum likelihood estimator is approximately minimax Consider squared error '\n",
            " 'loss which is squared bias plus variance. Actually finds a maximum '\n",
            " 'likelihood very simple Why, so you have here you avoid your likelihood and '\n",
            " 'again a classical maximum likelihood you have here as the full formula for '\n",
            " 'the likelihood and if you replace and you use the notation a pat of joy for '\n",
            " \"its number of times my equal job divided by and you're going to have. To \"\n",
            " 'combine prior beliefs with data in a principled way; use Bayesian in- '\n",
            " 'ference. To construct procedures with guaranteed long run performance, such '\n",
            " 'as confidence intervals; use frequentist methods. Generally; Bayesian '\n",
            " 'methods run into problems when the parameter space is high dimensional. In '\n",
            " 'particu- lar , 95 percent posterior intervals need not contain the true '\n",
            " 'value 95 percent of the time (in the frequency sense) - 11.10 Bibliographic '\n",
            " 'Remarks Some references on Bayesian inference include Carlin and Louis '\n",
            " '(1996) , Gel- man et al. (1995) , Lee (1997) , Robert (1994) , and Schervish '\n",
            " '(1995) . Actually finds a maxim Um likelihood very simply so you have here '\n",
            " 'you avoid your likelihood and again a classical maximum likelihood you have '\n",
            " 'here as the full formula for the likelihood and if you replace and you use '\n",
            " 'the notation a pat of joy for its number of times my equal job divided by '\n",
            " \"and you're going to have. : the presented rate is best for 8 0 close to zero \"\n",
            " \"resulting in 4-6' log pn_ log_ (pn_ B'l1 n n Op max for any 0 8' ( < 5/44) \"\n",
            " \"(and the upper bounds for & and &' are not of special inter- est). Such a \"\n",
            " 'consistency result for...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 11 of 16: Bootstrapping'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('these are ensemble methods for which the simple building block is a '\n",
            " 'regression o a classification tree. an eclectic online retailer sells two '\n",
            " 'items: socks and comput- ers Left: the number of pairs of socks, and '\n",
            " 'computers, purchased by eight online shoppers is displayed. 8.4 '\n",
            " 'bibliographic Remarks The bootstrap was invented by Efron (1979). And why I '\n",
            " 'would take the legend because it only has to ride come as in the '\n",
            " 'intersection the my antes and slow toe that you be beveryafise meteco great '\n",
            " 'distance between points and cost pockets. 7.7.1 Backpropagation How do we '\n",
            " 'find the directions to move 0 SO as to decrease the objective r(0) in '\n",
            " '(10.25)? You cannot have a finger out, but a size is like what you can fit '\n",
            " 'let then see what there is in this input i think it would be good if you '\n",
            " \"look thank you very much. Do so let's focus on padding for\")\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('There are several books on these topics including Efron and Tibshirani '\n",
            " '(1993) , Davison and Hinkley (1997) , Hall (1992) and Shao and Tu (1995) '\n",
            " 'Also, see section 3.6 of van der Vaart and Wellner (1996)_ 8.5 Appendix '\n",
            " '8.5.1 The Jackknife There is another method for computing standard errors '\n",
            " 'called the jackknife, due to Quenouille (1949). It is less computationally '\n",
            " 'expensive than the boot- ~0.- 116 8 The Bootstrap strap but is less general. '\n",
            " 'Let Tn = T(X1,_ Xn) be a statistic and Tr-i) de- note the statistic with the '\n",
            " 'ith observation removed. One can visualize (Figure 10.17) standing in a '\n",
            " 'mountainous terrain, and the goal is to get to the bottom through a series '\n",
            " 'of steps. As long as each step goes downhill, we must eventually get to the '\n",
            " 'bottom. In this case we were lucky; because with our starting guess 00 we '\n",
            " 'end up at the global minimum In general we can hope to end up at a (good) '\n",
            " 'local minimum 10.7.1 Backpropagation How do we find the directions to move 0 '\n",
            " 'SO as to decrease the objective R(0) in (10.25) ? You cannot have a finger '\n",
            " 'out, but a size is like what you can fit let then see what you can fit in '\n",
            " 'the input I know if this clear if I think someone is united I think it would '\n",
            " \"be good if you look thank you very much. Do so let's focus on padding for a \"\n",
            " \"moment, so does this gray frame so often it's used to preserve the inch \"\n",
            " \"boots a special size red so it's also cold sane padding so we've learned \"\n",
            " 'that are added. A throw and fro an area an atrou an to a stop and thru and a '\n",
            " 'toe an a thou and true and untrue and author and true and a thou and a hero '\n",
            " 'an for an author and true and Stop an athrou a to and a thou and thru an '\n",
            " 'auto on atrou a to and atou and a heron a two and a thou and a throw and. In '\n",
            " \"the grass whilst really three and are they playing out the running I don't \"\n",
            " \"know Right the right, it's as as a skate porter does a trick on a ramp well, \"\n",
            " \"it's not really a skate porter, but a stunt bicyclist and so on right so it \"\n",
            " \"doesn't perfectly get it, but it's a quite, quite good ...\")\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 12 of 16: Bayesian Viewpoint'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in this paper, we discuss some of the technicalities of nonparametric '\n",
            " 'bayesian inference. special emphasis is put on frequentist methods and '\n",
            " 'empirical bayes (EB ) or type-II maximum likelihood : in machine learning, '\n",
            " 'it is sometimes called the evidence procedure Empirical Bayes violates the '\n",
            " 'principle that the prior should be chosen independently of data. to '\n",
            " 'construct procedures with guaranteed long run performance, such as '\n",
            " 'confidence intervals ; use frequentism methods. 11.10 bibliographic Remarks '\n",
            " 'Some references on statistical inference include carlin and Louis (1996), '\n",
            " 'Gel- man et al. (1995),Lee (1997),Robert (1994)@xmath0, and Schervish ( '\n",
            " '1995). see Cox (1993)and diaconis and Freedman (1986)for discus- sions of a '\n",
            " 'few of these references. See berger and Delampady (1987)19011.1 for an '\n",
            " 'overview of our discussion so far. + * ams')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('To combine prior beliefs with data in a principled way; use Bayesian in- '\n",
            " 'ference. To construct procedures with guaranteed long run performance, such '\n",
            " 'as confidence intervals; use frequentist methods. Generally; Bayesian '\n",
            " 'methods run into problems when the parameter space is high dimensional. In '\n",
            " 'particu- lar , 95 percent posterior intervals need not contain the true '\n",
            " 'value 95 percent of the time (in the frequency sense) - 11.10 Bibliographic '\n",
            " 'Remarks Some references on Bayesian inference include Carlin and Louis '\n",
            " '(1996) , Gel- man et al. (1995) , Lee (1997) , Robert (1994) , and Schervish '\n",
            " '(1995) . See Cox (1993) , Diaconis and Freedman (1986) , Freedman (1999) , '\n",
            " 'Barron et al. (1999) , Ghosal et al. (2000) , Shen and Wasserman (2001) , '\n",
            " 'and Zhao (2000) for discus- sions of some of the technicalities of '\n",
            " 'nonparametric Bayesian inference. The Robins-Ritov example is discussed in '\n",
            " 'detail in Robins and Ritov (1997) where it is cast more properly as a '\n",
            " 'nonparametric problem. Example 11.10 is due to Edward George (personal '\n",
            " 'communication) . See Berger and Delampady (1987) 190 11. This overall '\n",
            " 'approach is called empirical Bayes (EB) or type-II maximum likelihood: In '\n",
            " 'machine learning, it is sometimes called the evidence procedure Empirical '\n",
            " 'Bayes violates the principle that the prior should be chosen independently '\n",
            " 'of the data. 11 Bayesian Inference 11.1 The Bayesian Philosophy The '\n",
            " 'statistical methods that we have discussed s0 far are known as frequen- tist '\n",
            " '(or classical) methods. The frequentist point of view is based on the '\n",
            " 'following postulates: Fl Probability refers to limiting relative frequencies '\n",
            " 'Probabilities are ob- jective properties of the real world. F2 Parameters '\n",
            " 'are fixed, unknown constants. (a) shows the belief state at frame 1 The '\n",
            " 'system has had to resample 5 times to keep the effective sample size above '\n",
            " 'the threshold of 150; () shows the belief state at frame 251; the red lines '\n",
            " 'show the estimated location of the center of the object over the last 250 '\n",
            " 'frames (c) shows that ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 13 of 16: Gaussian Mixture Models'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('distributions over continuous vari- ables are described by mixtures of '\n",
            " 'gaussians. this model is also known as latent class analysis ( lazarsfeld '\n",
            " 'and Henry, 1968; McLachlan and Peel, 2000 ) formalizes this approach in '\n",
            " 'terms of a regression model that explicitly pools information. 11.2.1 '\n",
            " 'mixture models 343 expert predictions, fixed mixing weights-0 gating '\n",
            " \"functions for three different 'experts' 12.5 bayesian methods for \"\n",
            " '\"fitting\" LG-SSMs there are various offline Bayesian alternatives to the '\n",
            " 'maximum likelihood ( EM ) algorithm ; including variational Bayes ( beal '\n",
            " '2003; Barber and Chiappa 2007) and blocked Gibbs sampling (Carter and kohn '\n",
            " '1994; cappe et al. 2005; Fruhwirth-Schnatter 2007). 10.7.3 matrix '\n",
            " 'cross-covariance matrix yTX/n ( assuming y and x are centered ) @xmath0 '\n",
            " '<CUR> ] 0.8 <CUR>')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('0.8 0.2 0.3 0.6 0.8 0.9 (a) (b) Figure 11.3 A mixture of 3 Gaussians in 2d. '\n",
            " '(a) We show the contours of constant probability for each component in the '\n",
            " 'mixture. () A surface plot of the overall density Based on Figure 2.23 of '\n",
            " '(Bishop 2006a). Figure generated by mixGaussPlotDemo. 11.2.1 Mixtures of '\n",
            " 'Gaussians The most widely used mixture model is the mixture of Gaussians '\n",
            " '(MOG), also called a Gaussian mixture model or GMM In this model, each base '\n",
            " 'distribution in the mixture is a multivariate Gaussian with mean pk and '\n",
            " 'covariance matrix Zk. 9.3.3 Mixtures f Bernoulli distributions So far in '\n",
            " 'this chapter; we have focussed on distributions over continuous vari- ables '\n",
            " 'described by mixtures of Gaussians. As a further example of mixture mod- '\n",
            " 'elling, and to illustrate the EM algorithm in a different context; we now '\n",
            " 'discuss mix- tures of discrete binary variables described by Bernoulli '\n",
            " 'distributions. This model is also known as latent class analysis (Lazarsfeld '\n",
            " 'and Henry, 1968; McLachlan and Peel, 2000). The Gaussian mixture model has '\n",
            " 'the form M f (a) = @m O(x; pm , Zm (6.32) m=1 with mixing proportions @m , '\n",
            " '@m = 1, and each Gaussian density has m a mean m and covariance matrix Zm In '\n",
            " 'general, mixture models can use any component densities in place of the '\n",
            " 'Gaussian in (6.32): the Gaussian mixture model is by far the most popular: '\n",
            " 'The parameters are usually fit by maximum likelihood, using the EM algorithm '\n",
            " 'as described in Chapter &. Mixture models 343 expert predictions, fixed '\n",
            " 'mixing weights-0 gating functions; fixed mixing weights-0 11 5 -0.5 -0.5 (a) '\n",
            " '(b) predicted mean and var; fixed mixing weights-0 -1.5 -0.5 0.5 (c) Figure '\n",
            " '1.6 (a) Some data fit with three separate regression lines. (b) Gating '\n",
            " 'functions for three different \\'experts\" . (c) The conditionally weighted '\n",
            " 'average of the three expert predictions Figure generated by mixexpDemo. @Ci '\n",
            " 'aCi Zi zi Yi (a) (b) Figure 11.7 (a) A mixture of experts (b) A hierarchical '\n",
            " 'mixture of experts. 12 Latent linear models 12.1 Factor analysis...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 14 of 16: Beans'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('we may want to forecast the weather several days ahead, or cli- mate several '\n",
            " 'decades ahead. ( s0 on ) ; where we track market indices, trading volumes, '\n",
            " 'stock and bond prices, and exchange rates. time series of temperature, '\n",
            " 'rainfall, wind speed; air quality; and s 0 on. [0 10003 20 1 00 0] [3.01.1 '\n",
            " '2.3001.2 ] a procrustes transforma- tion applies a translation and rotation '\n",
            " 'to best match up the two set of points. if we zoom out as in the third '\n",
            " 'image, it gets confused and chooses the fountain rather than the hawk: In '\n",
            " 'the final image a \"\\'jacamar\" is a tropical bird from South and Central '\n",
            " 'America with similar coloring to the South African Cape Weaver. Mendel bred '\n",
            " 'peas with round yellow seeds and wrinkled green seeds. The number of each '\n",
            " 'type is multinomial with probability p 3 (p1,p2,P3')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Trouble so we have two cereals we have two meals by one of x, y open k two '\n",
            " 'off and then, since these are channels for each of these canals there is a '\n",
            " \"feature map for a one we have the one that MA P's x two that's called first \"\n",
            " 'Hilbert pace and off. All maps x two second space in which we have in each '\n",
            " 'of these Hilbert spaces we have our inner product and one of x, y equals two '\n",
            " 'in the product of that one of x is one of my hone. To come up with a new '\n",
            " 'corneal and that is possible right and where a couple of of composition '\n",
            " 'words can actually prove These as so typically a sum of two kernels is a '\n",
            " 'corneal on a lot of this essentially derives from from the property of '\n",
            " 'knowing that the that a colonel is is to be semi positively, definitely a '\n",
            " 'product of cranes as a corneal he can multiply a constant to a cornea and a '\n",
            " 'slate corneal and. The p-value is P(xk_1 > t) where t is the observed value '\n",
            " 'of the test statistic Theorem 10.17 is illustrated in Figure 10.5. 10.5 The '\n",
            " \"Permutation Test 161 10.18 Example (Mendel's peas):. Mendel bred peas with \"\n",
            " 'round yellow seeds and wrinkled green seeds. There are four types of '\n",
            " 'progeny: round yellow_ wrinkled yellow_ round green; and wrinkled green. The '\n",
            " 'number of each type is multinomial with probability p 3 (p1, P2, P3, p4). '\n",
            " 'Their data, which is over 10OGB when uncompressed; is publically available: '\n",
            " '3 An example of their data, for a set of 4-grams, is shown below serve as '\n",
            " 'the incoming 92 serve as the incubator 99 serve as the independent 794 serve '\n",
            " 'as the index 223 serve as the indication 72 serve as the indicator 120 serve '\n",
            " 'as the indicators 45 serve as the indispensable 111 serve as the '\n",
            " 'indispensible 40 serve as the individual 234 Although such an approach, '\n",
            " 'based on \"brute force and ignorance\" can be successful, it is rather '\n",
            " 'unsatisfying, since it is clear that this is not how humans learn (see eg,, '\n",
            " '(Tenenbaum and Xu 2000)). Ok and this is this is Panama keeps us busy pet '\n",
            " 'the because the mix mod so jester as water a bit cotton, cotton next mode '\n",
            " 'ga...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 15 of 16: Generative Adversarial Networks'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the concept of dissimilarity between two observations needs to be extended '\n",
            " 'to a pair of groups of observa- tions. this extension is achieved by '\n",
            " 'developing the notion of linkage, which defines the difference between '\n",
            " '(disjoint) pairs of observations. for this data, complete and average '\n",
            " 'linkage generally separate the observations into their correct groups. '\n",
            " 'however, single linkage identifies one point as belonging to its own cluster '\n",
            " '; although there are still two singletons. more generally; interval '\n",
            " 'censoring refers to the setting in which we do not know the exact event '\n",
            " 'time, but we know that it falls in some interval For instance, this setting '\n",
            " 'arises if we survey patients once per week in order to determine whether the '\n",
            " 'event has occurred. our simulated data will rep- resent the observed wait '\n",
            " 'times (in seconds ) for 2,000 customers who have phoned a call center. There '\n",
            " 'are three covariates : Operators ( the number of operators available at the '\n",
            " 'time of the call ), Center (')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('survdata () function, which is part of the coxed library: Our simulated data '\n",
            " 'will rep- resent the observed wait times (in seconds) for 2,000 customers '\n",
            " 'who have phoned a call center. In this context, censoring occurs if a '\n",
            " 'customer hangs up before his or her call is answered. There are three '\n",
            " 'covariates: Operators (the number of call center operators available at the '\n",
            " 'time of the call, which can range from 5 to 15), Center (either A, B; or C), '\n",
            " 'and Tine of day (Morning; Afternoon; or Evening). effect data The \"observed\" '\n",
            " 'data is stored in queuingsdata, with y corresponding to the event time and '\n",
            " 'failed an indicator of whether the call was answered (failed = T) or the '\n",
            " 'customer hung up before the call was answered (failed = F)_ We see that '\n",
            " 'almost 90% of calls were answered. More generally; interval censoring refers '\n",
            " 'to the setting in which we do not know the exact event time, but we know '\n",
            " 'that it falls in some interval For instance, this setting arises if we '\n",
            " 'survey patients once per week in order to determine whether the event has '\n",
            " 'occurred. The concept of dissimilarity between a pair of observations needs '\n",
            " 'to be extended to a pair of groups of observations: This extension is '\n",
            " 'achieved by developing the notion of linkage, which defines the '\n",
            " 'dissimilarity between two groups of observa- tions. The four most common '\n",
            " 'types of linkage complete, average; single, linkage and centroid are briefly '\n",
            " 'described in Table 12.3. Average, complete, and526 12_ Unsupervised Learning '\n",
            " 'Algorithm 12.3 Hierarchical Clustering 1. Generative Adversarial Networks; '\n",
            " \"Salimans et al'16 Improved Techniques for training GANs] Ezurich Progress in \"\n",
            " 'GANs 2014 2015 2016 2017 [Brundage et al, 2018 The Malicious Use of '\n",
            " 'Artificial Intelligence: Forecasting, Prevention, and Mitigation ] Ezurich '\n",
            " \"Newer examples 0 Ezurich [Kerras et al '18 Progressive Growing of GANs for \"\n",
            " 'Improved Quality, Stability, and Variation] Generative Adversarial Networks '\n",
            " '1 _ Simultaneously train two neural networks Generator G tries to produce '\n",
            " 'realist...')\n"
          ]
        },
        {
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/javascript": [
              "download(\"download_03e3c11d-5601-4b90-a588-ba7c4eab2a37\", \"SummarySearchTerms_intro to ML_exported__main_v1_allenailedlarge16384arxiv08.19.2021, 2305.txt\", 15786)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "Completed Summary Search -  08.19.2021, 23-05\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Vs8H1v3eCLcP"
      },
      "source": [
        "Run the same summary search over the additional term lists defined in the next cell:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tvaRMyjHCKoN",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000,
          "referenced_widgets": [
            "6d1cccc713dd438597db2af871b2f5ba",
            "7f30902493d549ba8cb0bdbc3df13509",
            "912bb412662b4c669aef4be773d94112",
            "89eded059d964f0e998777a81acfc6c4",
            "db3fb38ca1fd416f97f04cb06c50e62a",
            "18cad2a242a34155a4fc532f38d47e0d",
            "c0bf713a94ac4bd29e7eb882d7f84124",
            "a6726f99abbb41d89536dfdc4a29e3df",
            "7a098fc570b246f7beaf773dfcd53eae",
            "ad52d8079b294934a92e9001408de400",
            "04c44f25fe814601804e5bfd0a3c144d",
            "636c5a35860042c59953c7d317190638"
          ]
        },
        "outputId": "25e262ac-3582-43e6-d888-6608bce2ccb5"
      },
      "source": [
        "import time\n",
        "other_lists = {\n",
        "    # define any additional term lists here; each key is a label that is passed to the search as a prefix (add_text)\n",
        "        \"terms from pattern recog textbook\":PR_ML,\n",
        "        \"terms from past exams\":past_exams,\n",
        "}\n",
        "if run_search_term_list and len(other_lists) > 0:\n",
        "    # iterates through additional lists if any\n",
        "    \n",
        "    for key, value in other_lists.items():\n",
        "\n",
        "        search_for_terms(sumsearch_pipe, value, \n",
        "                         k_search=num_docs_pl, # lower this value if the run crashes\n",
        "                         add_text=key + \"_\", export_txt=False)\n",
        "        print(\"\\n\\n\\n\\n Moving to next term list {} \\n\\n\\n\".format(time.time()))"
      ],
      "execution_count": 55,
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6d1cccc713dd438597db2af871b2f5ba",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "getting defs for search_terms...:   0%|          | 0/9 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 0 of 9: gaussian distribution'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the kullback-Leibler divergence is a special case of two more general '\n",
            " 'classes of divergences called bregman and f-divergence. just like the '\n",
            " 'Euclidean distance, the keldysh - sian divergence can be written in terms of '\n",
            " 'a metric @xmath0 <CUR> ] where p is the mean and 02 = the variance. 2.3_ The '\n",
            " 'gaussian distribution is simply another name for the normal distribution : '\n",
            " 'in this book; we use the term Gaussian throughout, although we retain the '\n",
            " 'conventional use of the symbol n to denote this distribution. (2.42 ) there '\n",
            " 'are several distributions which have the following properties : a a _ 1 a '\n",
            " \"mean mode = var s (1)b b b2 ; var e(2)var o(o)v? 202 (p)e't2 dt v2 erf: 1 \"\n",
            " '@(x; p, 0) = 2 [1erf(e')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('2.3_ The Gaussian Distribution The Gaussian; also known as the normal '\n",
            " 'distribution; is a widely used model for the distribution of continuous '\n",
            " 'variables. In the case of a single variable x, the Gaussian distribution can '\n",
            " 'be written in the form 1 N(wlp,o2) = exp (2to2)1/2 (x _ p)? 202 (2.42) where '\n",
            " 'p is the mean and 02 is the variance. Just like the Euclidean distance is a '\n",
            " 'special case of a metric (Section 3.3) , the Kullback-Leibler divergence is '\n",
            " 'a special case of two more general classes of divergences called Bregman '\n",
            " 'divergences and f-divergences: The study of divergences is beyond the scope '\n",
            " 'of this book, and we refer for more details to the recent book by Amari '\n",
            " '(2016), one of the founders of the field of information geometry 6.5 '\n",
            " 'Gaussian Distribution The Gaussian distribution is the most well-studied '\n",
            " 'probability distribution for continuous-valued random variables. It is also '\n",
            " 'referred to as the normal normal distribution distribution. Normal The '\n",
            " 'normal distribution is simply another name for the Gaussian: In this book; '\n",
            " 'we use the term Gaussian throughout, although we retain the conventional use '\n",
            " 'of the symbol N to denote this distribution. For consistency, we shall refer '\n",
            " 'to the normal- gamma distribution as the Gaussian-gamma distribution, and '\n",
            " 'similarly the normal- Wishart is called the Gaussian-Wishart. In particular; '\n",
            " 'we can compute it in terms of the error function (erf: 1 @(x; p, 0) = 2 [1 + '\n",
            " \"erf(e/ v2)] where 2 = (x _ p) /o and (2.45) 2 erf(x) e't2 dt (2.46) The \"\n",
            " 'Gaussian distribution is the most widely used distribution in statistics '\n",
            " 'There are several reasons for this. First, it has two parameters which are '\n",
            " 'easy to interpret; and which capture some of the most basic properties of a '\n",
            " 'distribution, namely its mean and variance. For later reference, we note '\n",
            " 'that the distribution has the following properties: a a _ 1 a mean mode = '\n",
            " 'var S (2.57) b b b2 There are several distributions which are just special '\n",
            " 'cases of the Gamma, which we discuss below: Exponential distributio...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 1 of 9: log likelihood function'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the most common method for estimating parameters in a parametric model is '\n",
            " 'the maximum likelihood method. this article introduces the gamma '\n",
            " 'distribution, which is a flexible distribution for positive real valued rv '\n",
            " 's, i defined in terms of two parameters, called the shape a > 0 and the rate '\n",
            " 'b 0.7 ba Ga(Tlshape = @,rate = b)Ta-le-Tb t(a)0.3.1 gaussian (normal) '\n",
            " 'distribution has the following properties : a a _ 1 a mean mode = var s '\n",
            " '(2.57)b b b e [ u(y, a)] aeA (5.97) ; var[v] b28) a_ 1 model v] for & > 1 '\n",
            " 'b.29) E[ln -]? (a ) = lnb b3 o) h[r] InT(o)ta-1e-br (B.26) ] b\"ta')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('(c) Use the prior f(p1, p2, 1. Use simulation to find the posterior mean and '\n",
            " 'posterior 90 percent interval for T _ (d) Let p1 1 =p1 p2 p2 = be the '\n",
            " 'log-odds ratio. 1 Gam(rla,b) b\"ta-1e-br (B.26) T(a) Elv] B.27) ; var[v] '\n",
            " 'B.28) a_ 1 modelv] for & > 1 B.29) E[ln -] ? (a) = lnb B.3O) H[r] InT(a) = '\n",
            " '(a = 1)y(a) = lnb + a (B.31) where is the digamma function defined by '\n",
            " '(B.25): The gamma distribution is the conjugate prior for the precision '\n",
            " '(inverse variance) of a univariate Gaussian For @ 2 1 the density is '\n",
            " 'everywhere finite, and the special case of a = 1 is known as the exponential '\n",
            " 'distribution. Gaussian The Gaussian is the most widely used distribution for '\n",
            " 'continuous variables. This property is a useful way to encourage sparsity in '\n",
            " 'a model, as we will see in Section 13.3. 2.4.4 The gamma distribution The '\n",
            " 'gamma distribution is a flexible distribution for positive real valued rV S, '\n",
            " 'I defined in terms of two parameters, called the shape a > 0 and the rate b '\n",
            " '0.7 ba Ga(Tlshape = @, rate = b) Ta-le-Tb T(a) 0. It is (2.55) 6_ '\n",
            " 'Pierre-Simon Laplace (<PHONE>) was French mathematician, who played a key '\n",
            " 'role in creating the field of Bayesian statistics. 9.3 Maximum Likelihood '\n",
            " 'The most common method for estimating parameters in a parametric model is '\n",
            " 'the maximum likelihood method. Let X1, Xn be IID with PDF f (x; 0)_ 9.7 '\n",
            " 'Definition_ The likelihood function is defined by n Ln(0) = II f(X;;0) i=1 '\n",
            " '(9.5) The log-likelihood function is defined by en(0) = log Ln (0) . The '\n",
            " 'likelihood function is just the joint density of the data, except that we '\n",
            " 'treat it is a function of the parameter 0_ Thus, Ln 3 [0, 0). Given a prior '\n",
            " 'f(0) and data Xn = (X1;:.Xn) the posterior density is L(0)f(0) f(0|xn) = '\n",
            " 'where L(0) is the likelihood function and C = f cc)f(o) ae is the '\n",
            " 'normalizing constant. The posterior mean is 0 = f 0L(0) f (0)de f of(1x\"Jdo '\n",
            " '= 404 24. Simulation Methods If 0 (01,..S Ok) is multidimensional, then we '\n",
            " 'might be interested in the posterior for one of the components, 01, say. '\n",
            " 'Since we know p(...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 2 of 9: linear regression model'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "(\"linear regression is the'work horse of statistics and (supervised) machine \"\n",
            " 'learning : when augmented with kernels or other forms of basis function '\n",
            " 'expansion, it can model also non- linear relationships. And when the '\n",
            " 'gaussian output is replaced with a Bernoulli or multinoullis distribution, '\n",
            " 'classification ; as we will see below. in this chapter we discuss linear and '\n",
            " 'logistic regression. then we move on to nonparametric regression where we '\n",
            " 'focus on bayesian inference over the space of ensembles of trees, which '\n",
            " 'tends to work much better. + * ams subject classification 2010 : * primary '\n",
            " '60k35 ; secondary 62h05, 62m25, 82b10.*key words and phrases:*linear '\n",
            " 'regression, multi - output linear models, multiple outcome shrinkage, '\n",
            " 'selection, evidence procedure, bagging, tree learning, adaptive regression '\n",
            " 'trees.**pacs number(s ) : 89.65.gh, 02.30.')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('In Chapters 20 and 21 we discuss nonparametric regression: 13.1 Simple '\n",
            " 'Linear Regression The simplest version of regression is when Xi is simple '\n",
            " '(one-dimensional) and r(x) is assumed to be linear: r(x) = Bo + B1x. 1 The '\n",
            " \"term 'regression' is due to Sir Francis Galton (<PHONE>) who noticed that \"\n",
            " 'tall and short men tend to have sons with heights closer to the mean_ He '\n",
            " 'called this \"regression towards the mean. 210 13. Linear and Logistic '\n",
            " 'Regression 9 4 7 L 2 1 Y 2 7 8 4.0 4.5 5.0 5.5 log light intensity (X) '\n",
            " 'FIGURE 13.1. 2001, p70). The reason is that in PC regression, only the first '\n",
            " 'K (derived) dimensions are retained, and the remaining D K dimensions are '\n",
            " 'entirely ignored. Linear regression 7.1 Introduction Linear regression is '\n",
            " \"the 'work horse of statistics and (supervised) machine learning: When \"\n",
            " 'augmented with kernels or other forms of basis function expansion, it can '\n",
            " 'model also non- linear relationships. And when the Gaussian output is '\n",
            " 'replaced with a Bernoulli or multinoulli distribution, it can be used for '\n",
            " 'classification; as we will see below. 84 3 Linear Methods for Regression 3.7 '\n",
            " 'Multiple Outcome Shrinkage and Selection As noted in Section 3.2.4, the '\n",
            " 'least squares estimates in a multiple-output linear model are simply the '\n",
            " 'individual least squares estimates for each of the outputs. To apply '\n",
            " 'selection and shrinkage methods in the multiple output case one could apply '\n",
            " 'a univariate technique individually to each outcome Or si- multaneously to '\n",
            " 'all outcomes. This is known as the evidence procedure (MacKay 1995b).3 See '\n",
            " 'Section 13.7.4 for the algorithmic details: The evidence procedure provides '\n",
            " 'an alternative to using cross validation: For example, in Figure 7.13(b), we '\n",
            " 'plot the log marginal likelihood for different values of Q, as well as the '\n",
            " 'maximum value found by the optimizer: We see that; in this example, we get '\n",
            " 'the same result as 5-CV; shown in Figure 7.13(a). 2011)). Bagging is a '\n",
            " 'frequentist concept: It is also possible to adopt a Bayesian approach to '\n",
            " 'learning trees ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 3 of 9: graphical models'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('graphical models can be viewed as a model combination method in which only '\n",
            " 'one model is responsible for making predictions at ay given point in input '\n",
            " \"space. 1-Tp[ ; (n _ 1)1 4_Y 2 2( VzF(n+ 3) (1_ M)i '4+1Y(1 + 1 -M) <458 13 \"\n",
            " 'Graphical modeling * we can represent multi-stage (Bayesian ) decision '\n",
            " 'problems by using a graphical notation known as the decision diagram or an '\n",
            " 'influence diagram (Howard and Matheson 1981; Kjaerulff and Madsen 2008). '\n",
            " 'this extends directed graphical systems by adding decision nodes ( also '\n",
            " 'called ac- tion nodes ) represented by rectangles, and utility node (also '\n",
            " 'called value nodes), representing by diamonds. that is, we consider a '\n",
            " 'triangular array of observations (e.g. observe outcomes, perform actions, '\n",
            " 'move on to the next oil well, and continue drilling (and polluting) in this')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Simulated data in the plane, clustered into three classes (repre- sented by '\n",
            " 'orange, blue and green) by the K-means clustering algorithm that at each '\n",
            " 'level of the hierarchy, clusters within the same group are more similar to '\n",
            " 'each other than those in different groups: Cluster analysis is also used to '\n",
            " 'form descriptive statistics to ascertain whether or not the data consists of '\n",
            " 'a set distinct subgroups, each group representing objects with substantially '\n",
            " 'different properties. This latter goal requires an assessment of the degree '\n",
            " 'of difference between the objects as- signed to the respective clusters. '\n",
            " '14.4 Tree-based Models There are various simple, but widely used, models '\n",
            " 'that work by partitioning the input space into cuboid regions, whose edges '\n",
            " 'are aligned with the axes, and then assigning a simple model (for example, a '\n",
            " 'constant) to each region: They can be viewed as a model combination method '\n",
            " 'in which only one model is responsible for making predictions at ay given '\n",
            " 'point in input space. 1-Tp[ ; (n _ 1)I (n) 1 4_Y 2 2( VzF(n+ 3) (1_ M)i '\n",
            " '\\'4+1Y(1 + 1-M) <458 13 Graphical modeling 4_Y2 4 _ <C1(n = 1)( \\'4+1)\" = '\n",
            " 'C1(n _ 1) exp (n _ 3) logl 4+ Y where 0 < C1 C depends o M only, but not o p '\n",
            " 'Or Y By invoking (13.25), the proof is complete (note that the number n in '\n",
            " 'the proof corresponds to the actual sample size n 1) Lemma 13.1 can be '\n",
            " 'easily extended to partial correlations, as shown by Fisher (1924), using '\n",
            " 'projections for Gaussian distributions. Proposition 13.5. The process of '\n",
            " 'selecting a specific model, given a new input X, can be described by a '\n",
            " 'sequential decision making process corresponding to the traversal of a '\n",
            " 'binary tree (one that splits into two branches at each node) Here we focus '\n",
            " 'on a particular tree-based framework called classification and regression '\n",
            " 'trees, or CART (Breiman et al., 1984), although there are many other '\n",
            " 'variants going by such names as ID3 and C4.5 (Quinlan, 1986; Quinlan; 1993). '\n",
            " 'Figure 14.5 shows an illustration of a recursive binary partitioning of ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 4 of 9: latent variables'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('many time series fluctuate periodically as illustrated in Figure 18.7). '\n",
            " 'States 3 and 4 are transient because of the path 3 7 4 - 6 and once you hit '\n",
            " 'state 6 you can not return to 3 o1 4. since pii(1) > 0, all the states are '\n",
            " 'aperiodic. (b) Sample output; for c0 bo = 0, Co = (1,2,3,1), with a period '\n",
            " 'of 4 Color code as in ssmTimeSeriesSimple.com. hence e [ytly1:t-1] = @o + '\n",
            " 'tbo : this is thus a generalization of an classic constant linear trend '\n",
            " 'model ; an example of which is shown in the black line of Figure 11.6(b ). '\n",
            " 'such models form the basis of probabilistic matrix factorization, discussed '\n",
            " 'in section 27.4.3 by using the joint distribution, we can define whether two '\n",
            " 'events or the two red variables are independent, so')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('(b) Sample output; for c0 bo = 0, Co = (1,1,1), with a period of 4 Color '\n",
            " 'code as in Figure 18.5. Figure generated by ssmTimeSeriesSimple. hence E '\n",
            " '[ytly1:t-1] = @o +tbo: This is thus a generalization of the classic constant '\n",
            " 'linear trend model; an example of which is shown in the black line of Figure '\n",
            " '18.6(b). 18.2.4.3 Seasonality Many time series fluctuate periodically as '\n",
            " 'illustrated in Figure 18.7). States 3 and 4 are transient because of the '\n",
            " 'path 3 7 4 - 6 and once you hit state 6 you cannot return to 3 O1 4. Since '\n",
            " 'Pii(1) > 0, all the states are aperiodic. In summary; 3 and 4 are transient '\n",
            " 'while 1, 2, 5, and 6 are ergodic. 23.29 Example (Hardy-Weinberg) _ Here is a '\n",
            " 'famous example from genetics. Suppose a gene can be type Aor type a. There '\n",
            " 'are three types of people (called genotypes): AA, Aa, and aa. Let (p, Q,r) '\n",
            " 'denote the fraction of people of each genotype. survdata () function, which '\n",
            " 'is part of the coxed library: Our simulated data will rep- resent the '\n",
            " 'observed wait times (in seconds) for 2,000 customers who have phoned a call '\n",
            " 'center. In this context, censoring occurs if a customer hangs up before his '\n",
            " 'or her call is answered. There are three covariates: Operators (the number '\n",
            " 'of call center operators available at the time of the call, which can range '\n",
            " 'from 5 to 15), Center (either A, B; or C), and Tine of day (Morning; '\n",
            " 'Afternoon; or Evening). The rows in the figure show the resulting archetypes '\n",
            " 'from three runs, specifying two, three and four archetypes, respectively: '\n",
            " 'fluences, and other driving forces that may be hard to identify or measure. '\n",
            " 'Factor analysis is a classical technique developed in the statistical liter- '\n",
            " 'ature that aims to identify these latent sources. Factor analysis models are '\n",
            " 'typically wed to Gaussian distributions; which has to some extent hin- dered '\n",
            " 'their usefulness. But when we learn that you forgot to set your watch '\n",
            " 'properly; we would lower the chance that your friend was abducted: Hence, '\n",
            " 'P(Aliens yes] Late yes) P(Aliens yes Late yes; Watch 3 n...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 5 of 9: conditional mixture models'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in this chapter, we discuss probabilistic mixtures of unconditional density '\n",
            " 'models such as gaussians and conditional distributions over continuous vari- '\n",
            " 'ables. a further generalization to allow the mixing coeffi- cients also to '\n",
            " 'depend on the inputs then we obtain a hierarchical mixture of experts model '\n",
            " 'which is also known as latent class analysis (Lazarsfeld and Henry, 1968; '\n",
            " 'McLachlan and Peel, 2000 ). by introducing a latent indicator variable 2, '\n",
            " 'where z = k means that 0 comes from mixture component k The prior has the '\n",
            " 'form p(0)@xmath0p(z @ k)#1 <CUR> ] # 1<CUR> ] \\\\{2.5}\\\\ { 3.3 } \\\\ { 4.6}}\\\\ '\n",
            " '{ 5.8}(\\\\ { 6.7})\\\\ { 7.9})/{\\\\{ 8.4}}\\\\({ 9.17),\\\\[9.19)]\\\\{ 10.11}\\\\{ 11.')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('9.3.3 Mixtures f Bernoulli distributions So far in this chapter; we have '\n",
            " 'focussed on distributions over continuous vari- ables described by mixtures '\n",
            " 'of Gaussians. As a further example of mixture mod- elling, and to illustrate '\n",
            " 'the EM algorithm in a different context; we now discuss mix- tures of '\n",
            " 'discrete binary variables described by Bernoulli distributions. This model '\n",
            " 'is also known as latent class analysis (Lazarsfeld and Henry, 1968; '\n",
            " 'McLachlan and Peel, 2000). The usual approach is to perform exhaustive '\n",
            " 'search over all candidate values of K. However; sometimes we can set the '\n",
            " 'model to its maximal size, and then rely on the power of the Bayesian Occams '\n",
            " 'razor to \"kill off\" unwanted components An example of this will be shown in '\n",
            " 'Section 21.6.1.6, when we discuss variational Bayes. An alternative approach '\n",
            " 'is to perform stochastic sampling in the space of models: Traditional '\n",
            " 'approaches, such as (Green 1998, 2003; Lunn et al. SAMPLING METHODS Figure '\n",
            " '11.12 The Gibbs sampling method requires samples to be drawn from the '\n",
            " 'conditional distribution of a variable condi- tioned on the remaining '\n",
            " 'variables: For graphical models, this conditional distribution is a function '\n",
            " 'only of the states of the nodes in the Markov blanket. Choosing these '\n",
            " 'parameters is an example of model selection: We discuss some approaches '\n",
            " 'below. 11.5.1 Model selection for probabilistic models The optimal Bayesian '\n",
            " 'approach, discussed in Section 5.3, is to pick the model with the largest '\n",
            " 'marginal likelihood, K 3 argmaxk P(DIK) There are two problems with this. '\n",
            " 'First, evaluating the marginal likelihood for LVMs is quite difficult In '\n",
            " 'practice, simple approximations, such as BIC, can be used (see e.g;, (Fraley '\n",
            " 'and Raftery 2002)). An alternative way to motivate the hierarchical mixture '\n",
            " 'of experts model is to start with a standard probabilistic mixtures of '\n",
            " 'unconditional density models such as Gaussians and replace the component '\n",
            " 'densities with conditional distributions. Here we consider mixtures of '\n",
            " 'linear regression ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 6 of 9: mixture density network'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the most widely used mixture model is the mixture of Gaussians ( mOG ), also '\n",
            " 'called a gaussian mixture or GMM. this model extends the flexibility of '\n",
            " 'linear models to include more com- plex (e-g-, multimodal) predictive '\n",
            " 'distributions, they are still very limited : we can further increase the '\n",
            " 'capability of such models by allowing the mixing coefficients themselves to '\n",
            " 'be functions of the input variable, so that k tk (x)pr(tlx). k=l pttxt '\n",
            " '(14.53)this is a synthetic data set that arose out of a project aimed at '\n",
            " 'measuring nonin- vasively the proportions of oil, water; and gas in North '\n",
            " 'oil transfer pipelines. as a further example of mixture mod- elling, and to '\n",
            " 'illustrate the EM algorithm in a different context ; we now discuss mix- '\n",
            " 'tures of discrete binary variables described by Bernoulli distributions. '\n",
            " 'after describing the gating functions and the individual component densities '\n",
            " 'from @')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('From each city we draw ni people and observe how many people Yi have a '\n",
            " 'disease. Thus; Yi Binomial(ni, Pi)- We are allowing for different disease '\n",
            " 'rates in different cities. This problem can be resolved by extending the '\n",
            " 'model to allow the mixture coefficients themselves to be functions of 1 , '\n",
            " 'leading to models such as the mixture density networks discussed in Section '\n",
            " '5.6, and hierarchical mixture of experts discussed in Section 14.5.3. 670 '\n",
            " '14. Right: median (thick line) and quantiles (dashed lines) over 100 '\n",
            " 'different starting values Source: Figure 2.27 of (Sudderth 2006). Used with '\n",
            " 'kind permission of Erik Sudderth: A N C2 C3 C CN s(1,- s(2 , [s(3_ s(i;) '\n",
            " 'IS(N;) Figure 25.9 Factor graphs for affinity propagation. Circles are '\n",
            " 'variables, squares are factors. Each Ci node has N possible states. 0.8 0.2 '\n",
            " '0.3 0.6 0.8 0.9 (a) (b) Figure 11.3 A mixture of 3 Gaussians in 2d. (a) We '\n",
            " 'show the contours of constant probability for each component in the mixture. '\n",
            " '() A surface plot of the overall density Based on Figure 2.23 of (Bishop '\n",
            " '2006a). Figure generated by mixGaussPlotDemo. 11.2.1 Mixtures of Gaussians '\n",
            " 'The most widely used mixture model is the mixture of Gaussians (MOG), also '\n",
            " 'called a Gaussian mixture model or GMM In this model, each base distribution '\n",
            " 'in the mixture is a multivariate Gaussian with mean pk and covariance matrix '\n",
            " 'Zk. Compute all pairwise dis- similarities between the observations in '\n",
            " 'cluster A and the Average observations in cluster B and record the average '\n",
            " 'of these dissimilarities. Dissimilarity between the centroid for cluster A '\n",
            " '(a mean Centroid vector of length p) and the centroid for cluster B. '\n",
            " 'Centroid linkage can result in undesirable inversions. TABLE 12.3. A summary '\n",
            " 'of the four most commonly-used types of linkage in hierarchical clustering: '\n",
            " 'single linkage are most popular among statisticians. Average and complete '\n",
            " 'linkage are generally preferred over single linkage, as they tend to yield '\n",
            " 'more balanced dendrograms. Application of mixtures to the heart disease ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 7 of 9: log marginal likelihood'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the bayesian approach to probabilistic models is to pick the model with the '\n",
            " 'largest marginal likelihood, k 3 argmaxk p(DIK). empirical bayes violates '\n",
            " 'the principle that the prior should be chosen independently of the data.11.2 '\n",
            " 'comparing risk functions 195 3 2 r(0,02)p(tlxo)y(a) = lnb + a (26.13) where '\n",
            " 'nt,pa(t) are the counts (sufficient statistics ) for node t and its parents, '\n",
            " 'and score is defined in Equation 26.28.4.1 in section 5.3 we discuss some '\n",
            " 'methods for estimating parameters in a parametric model, such as maximum '\n",
            " 'likelihood, maximum likelihood method, posterior mode, plug-in '\n",
            " 'approximation, etc. 11.5 bibliographic Remarks 9.7 Definition_ the '\n",
            " 'log-likelihood function is just the joint densities of eigenvalues ; except '\n",
            " 'that we treat it is a function of')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "(\"Since we know p(OlD) = Beta(Ola' , b ), where a = a + N1 and b' = b + No, we \"\n",
            " \"know the normalization constant of the posterior is B(a' , 6'). Choosing \"\n",
            " 'these parameters is an example of model selection: We discuss some '\n",
            " 'approaches below. 11.5.1 Model selection for probabilistic models The '\n",
            " 'optimal Bayesian approach, discussed in Section 5.3, is to pick the model '\n",
            " 'with the largest marginal likelihood, K 3 argmaxk P(DIK) There are two '\n",
            " 'problems with this. First, evaluating the marginal likelihood for LVMs is '\n",
            " 'quite difficult In practice, simple approximations, such as BIC, can be used '\n",
            " '(see e.g;, (Fraley and Raftery 2002)). This overall approach is called '\n",
            " 'empirical Bayes (EB) or type-II maximum likelihood: In machine learning, it '\n",
            " 'is sometimes called the evidence procedure Empirical Bayes violates the '\n",
            " 'principle that the prior should be chosen independently of the data. (c) Use '\n",
            " 'the prior f(p1, p2, 1. Use simulation to find the posterior mean and '\n",
            " 'posterior 90 percent interval for T _ (d) Let p1 1 =p1 p2 p2 = be the '\n",
            " 'log-odds ratio. Typically the posterior mean Or median is the most '\n",
            " 'appropriate choice for a real- valued quantity, and the vector of posterior '\n",
            " 'marginals is the best choice for a discrete quantity: However, the posterior '\n",
            " 'mode, aka the MAP estimate, is the most popular choice because it reduces to '\n",
            " 'an optimization problem, for which efficient algorithms often exist. '\n",
            " 'Unfortunately this integral is intractable The simplest approximation is the '\n",
            " 'plug-in approximation; which, in the binary case, takes the form ply = lx,D) '\n",
            " 'ply = 1x,E [w]) (8.60) where E [w] is the posterior mean. A 95 percent '\n",
            " 'posterior interval can be obtained by numerically finding a and b such that '\n",
            " 'f(plz\" ) dp = .95 . Suppose that instead of a uniform prior, we use the '\n",
            " 'prior p ~ Beta(a, 8)_ If you repeat the calculations above, you will see '\n",
            " 'that plz\" Beta(a + <CUR>,8 + 11.2 The Bayesian Method 179 n _ 8) . The flat '\n",
            " 'prior is just the special case with a = B = 1. Again, we can compute the '\n",
            " 'marginal likelihood i...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 8 of 9: posterior probability'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('maximum posterior is our internal belief state about the world : the way to '\n",
            " 'test if our beliefs are justified is to use them to predict objectively '\n",
            " 'observable quantities ( this is the basis of the scientific method ) '\n",
            " 'specifically the posterior predictive distribution in this context is given '\n",
            " 'by p( e cld) = pky = 1/w,h)p(dlz) h (3.8) we can now determine w by finding '\n",
            " 'the most probable value of w given the data, in other words by maximizing a '\n",
            " 'posterior distribution or simply MAP. 2.2.3 Conditional probability '\n",
            " 'Combining the definition of conditional probability with the product and sum '\n",
            " 'rules yields bayes rule, also called Bayes Theorem? Solving this recurrence '\n",
            " 'yields the following sequence: 1, 3, 25, 543, 29281, <PHONE>, etc2. we '\n",
            " 'review approximate methods, some of which we briefly review below : 26.4.1 '\n",
            " 'approximating the mode ; 15.6')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "(\"Since we know p(OlD) = Beta(Ola' , b ), where a = a + N1 and b' = b + No, we \"\n",
            " \"know the normalization constant of the posterior is B(a' , 6'). Figure \"\n",
            " 'generated by numbersGame_ 3.2.4 Posterior predictive distribution The '\n",
            " 'posterior is our internal belief state about the world: The way to test if '\n",
            " 'our beliefs are justified is to use them to predict objectively observable '\n",
            " 'quantities (this is the basis of the scientific method) Specifically the '\n",
            " 'posterior predictive distribution in this context is given by p( e Cld) = '\n",
            " 'pky = 1/w,h)p(hld) h (3.8) This is just a weighted average of the '\n",
            " 'predictions of each individual hypothesis and is called Bayes model '\n",
            " 'averaging (Hoeting et al. 1999). This is illustrated in Figure 3.4. (1.66) '\n",
            " 'We can now determine w by finding the most probable value of w given the '\n",
            " 'data, in other words by maximizing the posterior distribution: This '\n",
            " 'technique is called maximum posterior, or simply MAP. 2.2.2.3 Conditional '\n",
            " 'probability We define the conditional probability of event A, given that '\n",
            " 'event B is true, as follows: p(A,B) P(AIB) if p(B) > 0 (2.6) p(B) 2.2.3 '\n",
            " 'Bayes rule Combining the definition of conditional probability with the '\n",
            " 'product and sum rules yields Bayes rule, also called Bayes Theorem? Solving '\n",
            " 'this recurrence yields the following sequence: 1, 3, 25, 543, 29281, '\n",
            " '<PHONE>, etc2 In view of the enormous size of the hypothesis space, we are '\n",
            " 'generally forced to use approximate methods, some of which we review below: '\n",
            " '26.4.3.1 Approximating the mode of the posterior We can use dynamic '\n",
            " 'programming to find the globally optimal MAP DAG (up to Markov equiv- '\n",
            " 'alence) (Koivisto and Sood 2004; Silander and Myllmaki 2006). This justifies '\n",
            " 'the common practice of quoting a credible interval in the form of p6 = 20, '\n",
            " 'where /6 represents the posterior mean; 0 represents the posterior standard '\n",
            " 'deviation, and 2 is a good approximation to 1.96. Of course, the posterior '\n",
            " 'is not always Gaussian For example, in our coin example, if we use a uniform '\n",
            " 'prior and we observe Ni 47 hea...')\n"
          ]
        },
        {
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/javascript": [
              "download(\"download_90a4a64a-bd15-4f8c-adb4-d2fc3b3e3dd6\", \"SummarySearchTerms_intro to ML_exported_terms from pattern recog textbook_allenailedlarge16384arxiv08.19.2021, 2307.txt\", 8291)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "Completed Summary Search -  08.19.2021, 23-07\n",
            "\n",
            "\n",
            "\n",
            "\n",
            " Moving to next term list 1629414422.756508 \n",
            "\n",
            "\n",
            "\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "636c5a35860042c59953c7d317190638",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "getting defs for search_terms...:   0%|          | 0/7 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 0 of 7: confusion matrix tolerance'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the goal of blind- source separation is to identify the constituent parts of '\n",
            " 'the mixed signals. as discussed previously in the context of maximum '\n",
            " 'likelihood estimation for pPCA, the original PCA solution is invariant to '\n",
            " 'any rotation. therefore, it can identify a lower - dimensional subspace in '\n",
            " 'which the sig- nals live, but not the signals themselves (Murphy, 2012 ). '\n",
            " 'actually is symmetric right the matrix of my eyes the commotion of do you '\n",
            " 'have to think this is all in one dimension right of your mind is the only '\n",
            " \"work for my dimension so depending on your training points you'll get the \"\n",
            " 'matrices that will just have values of. imagine you are in a busy train '\n",
            " 'station with many people talking your ears play the role of microphones, and '\n",
            " 'they linearly mix different speech signals in an area where there are '\n",
            " 'hundreds of people who are trying to speak at the same time. according to '\n",
            " 'tij12.3 Missing Values and Matrix Completion 513 4')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('The Bonferroni correction is by far the best-known and most commonly- used '\n",
            " 'multiplicity correction in all of statistics. Its ubiquity is due in large '\n",
            " 'part to the fact that it is very easy to understand and simple to implement, '\n",
            " 'and also from the fact that it successfully controls Type I error regardless '\n",
            " 'of whether the m hypothesis tests are independent. Figure 12.5 shows that '\n",
            " 'the recovery of the missing elements 5This algorithm is referred to as '\n",
            " '\"Hard-Impute\" in Mazumder, Hastie, and Tibshi- rani (2010) \"Spectral '\n",
            " 'regularization algorithms for learning large incomplete matrices\" published '\n",
            " 'in Journal of Machine Learning Research; pages 2287 2322_ 6Each iteration of '\n",
            " 'Step 2 of this algorithm decreases the objective (12.14). However , the '\n",
            " 'algorithm is not guaranteed to achieve the global optimum of (12.12). '\n",
            " 'Tij12.3 Missing Values and Matrix Completion 513 4 MD 2 AK TN TX 3 9 8 ] 3 '\n",
            " 'NY WA MO VA_ OR WY MA MT MN ? (a) shows the belief state at frame 1 The '\n",
            " 'system has had to resample 5 times to keep the effective sample size above '\n",
            " 'the threshold of 150; () shows the belief state at frame 251; the red lines '\n",
            " 'show the estimated location of the center of the object over the last 250 '\n",
            " 'frames (c) shows that the system can handle visual clutter; as long as it '\n",
            " 'does not have the same color as the target object (d) shows that the system '\n",
            " 'is confused between the grey of the helicopter and the grey of the building: '\n",
            " 'The posterior is bimodal. Imagine you are in a busy train station with many '\n",
            " 'people talking Your ears play the role of microphones, and they linearly mix '\n",
            " 'different speech signals in the train station. The goal of blind- source '\n",
            " 'separation is to identify the constituent parts of the mixed signals As '\n",
            " 'discussed previously in the context of maximum likelihood estimation for '\n",
            " 'PPCA, the original PCA solution is invariant to any rotation. Therefore, PCA '\n",
            " 'can identify the best lower-dimensional subspace in which the sig- nals '\n",
            " 'live, but not the signals themselves (Murphy, 2012). Actually is sym...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 1 of 7: empirical risk minimizer'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('bayesian methods run into problems when the parameter space is high '\n",
            " 'dimensional. to combine prior beliefs with data in a principled way; use '\n",
            " 'Bayesian in- ference. to construct procedures with guaranteed long run '\n",
            " 'performance, such as confidence intervals ; use frequentist methods. 12.8 '\n",
            " 'bibliographic Remarks some references on decision theory include Carlin and '\n",
            " 'Louis (1996), Gel- man et al. (1995 ), Lee (1997), Robert (1994)@xmath0, and '\n",
            " 'Schervish ( 1995). 1.2 comparing risk functions 195 3 2 r(0,02)r(03 0s k _ 2 '\n",
            " '+ 1 _ Xi X2 05 (X) (12.12) and (2)+ max{z,0}. this estimator shrinks the Xi '\n",
            " 's towards 0_ The message is that, when estimating many parameters, there is '\n",
            " 'great value in shrinking the estimates. This observation plays an important '\n",
            " 'role in modern nonparametric function')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "(\"Here are some better estimates of risk: Mallow's Cp statistic is defined by \"\n",
            " 'R(S) = RtrC (8) + 2812 (13.26) where |S| denotes the number of terms in S '\n",
            " 'and 02 is the estimate of 02 obtained from the full model (with all '\n",
            " 'covariates in the model). This is simply the training error plus a bias '\n",
            " 'correction: This estimate is named in honor of Colin Mallows who invented '\n",
            " 'it. The first term in (13.26) measures the fit of the model while the second '\n",
            " 'measure the complexity of the model. To combine prior beliefs with data in a '\n",
            " 'principled way; use Bayesian in- ference. To construct procedures with '\n",
            " 'guaranteed long run performance, such as confidence intervals; use '\n",
            " 'frequentist methods. Generally; Bayesian methods run into problems when the '\n",
            " 'parameter space is high dimensional. In particu- lar , 95 percent posterior '\n",
            " 'intervals need not contain the true value 95 percent of the time (in the '\n",
            " 'frequency sense) - 11.10 Bibliographic Remarks Some references on Bayesian '\n",
            " 'inference include Carlin and Louis (1996) , Gel- man et al. (1995) , Lee '\n",
            " '(1997) , Robert (1994) , and Schervish (1995) . In frequentist decision '\n",
            " 'theory, the average loss is called the risk (see Section 6.3), so this '\n",
            " 'overall approach is called empirical risk minimization or ERM (see Section '\n",
            " '6.5 for details). This overall approach is called empirical Bayes (EB) or '\n",
            " 'type-II maximum likelihood: In machine learning, it is sometimes called the '\n",
            " 'evidence procedure Empirical Bayes violates the principle that the prior '\n",
            " 'should be chosen independently of the data. It can be shown that the '\n",
            " 'James-Stein estimator 0S has smaller risk; where 0s (03 0s k _ 2 + 1 _ Xi X2 '\n",
            " \"05 (X) (12.12) and (2)+ max{z, 0}. This estimator shrinks the Xi's towards \"\n",
            " '0_ The message is that, when estimating many parameters, there is great '\n",
            " 'value in shrinking the estimates. This observation plays an important role '\n",
            " 'in modern nonparametric function estimation. 12.8 Bibliographic Remarks '\n",
            " 'Aspects of decision theory can be found in Casella and Berger (2002) , '\n",
            " 'Berger (1985), Ferg...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 2 of 7: component analysis pca'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('principal components analysis (PCA ) is a technique that is widely used for '\n",
            " 'appli- cations such as dimensionality reduction, lossy data compression, '\n",
            " 'feature extraction, and data visualization. it is also known as the '\n",
            " 'Karhunen-Loeve trans - form there are tWO commonly used definitions of PCA '\n",
            " 'that give rise to the same algorithm. we shall expand on this concept in '\n",
            " 'Chapter 10.12.2 when we constrain the orthogonal projection of the data onto '\n",
            " 'a lower dimensional linear space called the principal subspace such that the '\n",
            " 'variance of projected data is maximized (Hotelling: [933 ] ). 12.1 Principal '\n",
            " 'components regression ( PCR ) approach involves construct- principal ing the '\n",
            " 'first m principal component; Z1,ZM ; and then using these components '\n",
            " 'components as predictors in a linear regression model which is fit u s- '\n",
            " 'regression ing least squares.18.6.3 Dimension Reduction Methods 257 8 8 2 8 '\n",
            " '@xmath')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Section 12.+ 121 Principal Component Analysis Principal component analysis, '\n",
            " 'or PCA, is a technique that is widely used for appli- cations such aS '\n",
            " 'dimensionality reduction, lossy data compression , feature extraction, and '\n",
            " 'data visualization (Jolliffe. 2002). It is also known as the Karhunen-Loeve '\n",
            " 'trans - form There are tWO commonly used definitions of PCA that give rise '\n",
            " 'to the same algorithm. PCA can be defined as the orthogonal projection of '\n",
            " 'the data onto a lower dimensional linear space, known as the principal '\n",
            " 'subspace. such that the variance of the projected data is maximized '\n",
            " '(Hotelling: [933). 12.2 Principal components analysis (PCA) Consider the FA '\n",
            " 'model where we constrain Q = 02I, and W to be orthonormal: It can be shown '\n",
            " '(Tipping and Bishop 1999) that; as 02 0, this model reduces to classical '\n",
            " '(non- probabilistic) principal components analysis PCA), also known as the '\n",
            " 'Karhunen Loeve transform: The version where 02 0 is known as probabilistic '\n",
            " 'PCA (PPCA) (Tipping and Bishop 1999), or sensible PCA (Roweis 1997). (An '\n",
            " 'equivalent result was derived independently, from a different perspective, '\n",
            " 'in (Moghaddam and Pentland 1995).) P514 12_ Unsupervised Learning C) 9 M L 1 '\n",
            " '2 ] 9 9 3 -2 ~1 2 1 0 8 L 2 3 2 8 10 12 True First Principal Component True '\n",
            " 'PC Variances FIGURE 12.6 As described in the tert, in each of 100 trials, we '\n",
            " 'left out 20 elements of the USArrests dataset. In each trial, we applied '\n",
            " 'Algorithm 12.1 with M = 1 to impute the missing elements and compute the '\n",
            " 'principal components. > X < - data matrix ( scale ( USArrests ) ) > pcob <- '\n",
            " 'prcomp ( X) > summary (pcob ) Importance of component s : PC 1 PC2 PC3 PC4 '\n",
            " 'Standard deviation 1 . 5749 0 9949 0 59713 0 41645 Proportion of Variance 0 '\n",
            " '6201 0 . 2474 0 . 08914 0 . 04336 Cumulative Proportion 0 6201 0 . 8675 0 . '\n",
            " '95664 1. 00000 We see that the first principal component explains 62% of the '\n",
            " 'variance. They would successively maximize variance, subject to the '\n",
            " 'constraint of being uncorrelated with the preceding components The Pr...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 3 of 7: likelihood estimate mle'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('bayesian credible intervals for online inference in state space models are '\n",
            " 'compared to frequentist confidence intervals using the Boyen-Koller '\n",
            " 'algorithm. we discuss how to pick the \"right\" value of k. 1.4.8 model '\n",
            " 'selection for probabilistic models ( e.g\" linear or logistic regression '\n",
            " 'models with different degree polynomials, or kepler - nelson - robertson - '\n",
            " 'walker ( knn ) classifiers that have different values of @xmath0 ) this is '\n",
            " 'an example of choice of parameters : it is a natural approach to compute the '\n",
            " 'misclassification rate on 10.5.10.1 SAT stands for \"Scholastic Aptitude '\n",
            " 'Test\". 2.3.2 gaussian approximation for offline inference ; and then '\n",
            " 'computing the posterior marginals this can be measured using a 100(1 _ a)% '\n",
            " 'credible interval, c = (e,u) (standing for lower and upper ) which contains '\n",
            " '1')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Below we discuss how to pick the \"right\" value of K. 1.4.8 Model selection '\n",
            " 'When we have a variety of models of different complexity (e.g\" linear or '\n",
            " 'logistic regression models with different degree polynomials, or KNN '\n",
            " 'classifiers with different values of K) how should we pick the right one? A '\n",
            " 'natural approach is to compute the misclassification rate on 10. SAT stands '\n",
            " 'for \"Scholastic Aptitude Test\". This is a standardized test for college '\n",
            " 'admissions used in the United States (the data in this example is from '\n",
            " '(ohnson and Albert 1999, p87)). State space models This can be computed by '\n",
            " 'performing a predict-update step using the factored prior; and then '\n",
            " 'computing the posterior marginals This is known as the Boyen-Koller '\n",
            " 'algorithm, named after the authors of (Boyen and Koller 1998), who '\n",
            " 'demonstrated that the error incurred by this series of repeated '\n",
            " 'approximations remains bounded (under certain assumptions about the '\n",
            " 'stochasticity of the system) 18.5.3.2 Gaussian approximation for online '\n",
            " 'inference in GLMs Now suppose q(0t) = IIP-1 N(;luut j, Tt,j), where Tt,j is '\n",
            " 'the variance. This can be measured using a 100(1 _ a)% credible interval, '\n",
            " 'which is a (contiguous) region C = (e,u) (standing for lower and upper) '\n",
            " 'which contains 1 7 a of the posterior probability mass, i,e, Ca(D) = (C,u) : '\n",
            " 'P(e < 0 < uD) = 1 = & (5.8) There may be many such intervals, so we choose '\n",
            " 'one such that there is (1 _ 0)/2 mass in each tail; this is called a central '\n",
            " 'interval 5.2. The method uses a simple linear motion model for the centroid '\n",
            " 'of the object; and a color histogram for the likelihood model; using '\n",
            " 'Bhattacharya distance to compare histograms. The proposal distribution is '\n",
            " 'obtained by sampling from the likelihood See (Nummiaro et al. 2003) for '\n",
            " 'further details. 23.5. We find the 95% posterior credible interval is '\n",
            " '(0.3749,0.5673) (see betaCredibleInt for the one line of Matlab code we used '\n",
            " \"to compute this). If we don't know the functional form, but we can draw \"\n",
            " 'samples from the posterior; then we can use a...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 4 of 7: pca'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('the goal of this paper is to provide a pedagogical introduction to principal '\n",
            " 'components analysis ( pca ) from 20-newsgroup data. there are 9 covariates : '\n",
            " 'systolic blood pressure, cumulative tobacco (kg), ldl (low density '\n",
            " 'lipoprotein choles- terol), adiposity; famhist (family history of heart '\n",
            " 'disease ), typea (type-A be- havior) ; obesity; alcohol (current alcohol '\n",
            " 'consumption ) ; and age. prostate specific antigen (PSA ) and a number of '\n",
            " 'clinical measures, in 97 men who were about to receive a radical '\n",
            " 'prostatectomy: we include qualitative predictors such as Shelveloc, an in- '\n",
            " 'dicator of the quality of shelving location that is, the space within a '\n",
            " 'store in which the car seat is displayed at each location_ the pre- dictor '\n",
            " 'has o three possible values: Bad, Medium, and Good. here is an example that '\n",
            " 'was one of')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('Learning DAG structures 921 evidence case course question msg drive '\n",
            " 'christian shuttle disk government religion jesus disease mission space Jews '\n",
            " 'engine patients orbit games program rights power bible honda computer bmw '\n",
            " 'medicine earth solar season launch technology dealer sclence moon system '\n",
            " 'team satellite files problem studies mar players version hockev windows '\n",
            " 'israel university_ puck baseball won email memory president state research '\n",
            " 'league fans phone format video mac children world cancer number Image data '\n",
            " 'driver software water health food aids insurance doctor card help server '\n",
            " 'graphics tamin display Figure 26.8 A locally optimal DAG learned from the '\n",
            " '20-newsgroup data. prostate specific antigen (PSA) and a number of clinical '\n",
            " 'measures, in 97 men who were about to receive a radical prostatectomy: The '\n",
            " 'goal is to predict the log of PSA (1psa) from a number of measure- ments '\n",
            " 'including 1og cancer volume (1cavol) , 1og prostate weight lweight, age, log '\n",
            " 'of benign prostatic hyperplasia amount Ibph, seminal vesicle in- vasion Svi, '\n",
            " '1og of capsular penetration lcp; Gleason score gleason; and percent of '\n",
            " 'Gleason scores 4 Or 5 pgg45. 81 124 113 13 501 72 ShelveLoc Age Education '\n",
            " 'Urban US 1 Bad 42 17 Yes Yes 2 Good 65 10 Yes Yes 3 Medium 59 12 Yes Yes 4 '\n",
            " 'Medium 55 14 Yes Yes 5 Bad 38 13 Yes No 6 Bad 78 16 No Yes The Carseats data '\n",
            " 'includes qualitative predictors such as Shelveloc, an in- dicator of the '\n",
            " 'quality of the shelving location that is, the space within a store in which '\n",
            " 'the car seat is displayed at each location_ The pre- dictor Shelveloc takes '\n",
            " 'o three possible values: Bad, Medium, and Good. We include the examples for '\n",
            " 'their pedagogical value but we do want to sound a note of caution about '\n",
            " 'interpreting the results with some skepticism _ 8.6 Example. Here is an '\n",
            " 'example that was one of the first used to illustrate the bootstrap by '\n",
            " 'Bradley Efron; the inventor of the bootstrap. The data are LSAT scores (for '\n",
            " 'entrance to law school) and GPA_ 8.3 Bootstrap Confidence Intervals 113 '\n",
            " 'LS...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 5 of 7: weighted empirical risk'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('frequentist and bayesian inference lead to two different schools of '\n",
            " 'inference. the first is called empirical risk minimization or ERM ( see '\n",
            " 'section 6.5 for details ), so this overall approach is known as empirically '\n",
            " 'risk - minimized decision theory ( aikike information cri- terion)_the idea '\n",
            " 'is to choose s to maximize es ~ isl (13.27) where es is the log-likelihood '\n",
            " 'of the model evaluated at; the maryland economic policy institute ( the '\n",
            " 'markov chain of maria lomonosov leibniz university ) ; the second '\n",
            " 'interpretation is that we combine prior beliefs with data in a principled '\n",
            " 'way using Bayesian in- ference. 8.7 Example_ this example is from efron and '\n",
            " 'Tibshirani (1993). when drug companies introduce new medications, they are '\n",
            " 'sometimes required to show bioequivalence. an infinitely long, unpredictable '\n",
            " 'sequence of tosses whose limiting proportion tends to a constant is')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "(\"Here are some better estimates of risk: Mallow's Cp statistic is defined by \"\n",
            " 'R(S) = RtrC (8) + 2812 (13.26) where |S| denotes the number of terms in S '\n",
            " 'and 02 is the estimate of 02 obtained from the full model (with all '\n",
            " 'covariates in the model). This is simply the training error plus a bias '\n",
            " 'correction: This estimate is named in honor of Colin Mallows who invented '\n",
            " 'it. The first term in (13.26) measures the fit of the model while the second '\n",
            " 'measure the complexity of the model. To combine prior beliefs with data in a '\n",
            " 'principled way; use Bayesian in- ference. To construct procedures with '\n",
            " 'guaranteed long run performance, such as confidence intervals; use '\n",
            " 'frequentist methods. Generally; Bayesian methods run into problems when the '\n",
            " 'parameter space is high dimensional. In particu- lar , 95 percent posterior '\n",
            " 'intervals need not contain the true value 95 percent of the time (in the '\n",
            " 'frequency sense) - 11.10 Bibliographic Remarks Some references on Bayesian '\n",
            " 'inference include Carlin and Louis (1996) , Gel- man et al. (1995) , Lee '\n",
            " '(1997) , Robert (1994) , and Schervish (1995) . Figure &.1 shows the data '\n",
            " 'and a histogram of the bootstrap replications 0f_ B This histogram is an '\n",
            " 'approximation to the sampling distribution of 0. The Normal-based 95 percent '\n",
            " 'confidence interval is 78 + 2se (.51,1.00) while the percentile interval is '\n",
            " '(.46,.96) In large samples, the two methods will show closer agreement. 8.7 '\n",
            " 'Example_ This example is from Efron and Tibshirani (1993). When drug '\n",
            " 'companies introduce new medications, they are sometimes required to show '\n",
            " 'bioequivalence. An infinitely long, unpredictable sequence of tosses whose '\n",
            " 'limiting proportion tends to a constant is an idealization, much like the '\n",
            " 'idea of a straight line in geometry. The degree-of-belief interpretation is '\n",
            " \"that P(A) measures an observer's strength of belief that A is true. In \"\n",
            " 'either interpretation; we require that Axioms 1 to 3 hold. The difference in '\n",
            " 'inter- pretation will not matter much until we deal with statistical '\n",
            " 'inference. ...')\n",
            "\n",
            "\n",
            "~~~~~~~~~~~~~~~~~~\n",
            "\n",
            "'search_term 6 of 7: convergence of algorithm'\n",
            "\n",
            "\n",
            "\n",
            "+++++++\n",
            "\n",
            "\n",
            "model description: #1 of 1\n",
            "\n",
            "('in probability, convergence is more subtle : going back to calculus for a '\n",
            " 'moment, suppose that xn = x for all n _ then, trivially; limn-oo In x _ '\n",
            " 'Consider a probabilistic version of this example. (b) Illustration of the '\n",
            " 'fact that at the end of a line search (top of picture), the local gradient '\n",
            " 'of function will be perpendicular to the search direction based on Figure '\n",
            " '10.6.1 of (Press et al. 1988 ) we develop a more stable method for picking '\n",
            " 'the step size, so that the method is guaran- teed to converge to a local '\n",
            " 'optimum no matter where we start. ( this property is called global '\n",
            " 'convergence, which should not be confused with convergence to an optimum! ) '\n",
            " 'that is, the chain has converged when in fact it has not: this is a flaw '\n",
            " 'common to all convergence diagnostics, since diagnosing divergence is '\n",
            " 'computationally intractable in general ( bhatnagar')\n",
            "\n",
            " the context (first 2k chars) is: \n",
            "\n",
            "('MAP state estimation 801 one can show (Kolmogorov and Wainwright 2005) that '\n",
            " 'the max-product version of TRBP does solve the above LP relaxation: A '\n",
            " 'certain scheduling of this algorithm; known as sequential TRBP, TRBP-S, or '\n",
            " 'TRW-S, can be shown to always converge (Kolmogorov 2006), and furthermore, '\n",
            " 'it typically does so faster than the standard parallel updates. The idea is '\n",
            " 'to pick an arbitrary node ordering X1;. XN. We sum- marize that we obtain a '\n",
            " 'linear fit flm] (:) for the population minimizer f\" of the loss function. '\n",
            " 'With Lz Boosting (Algorithm 8) and the componentwise linear least squares '\n",
            " 'base procedure in (12.10) we obtain a linear model fit for every iteration '\n",
            " 'm. As m tends to infinity, flm] converges to a least squares solution. The '\n",
            " 'method is also known as matching pursuit in signal processing (Mallat and '\n",
            " 'Zhang, 1993), weak greedy algo- rithm in computational mathematics '\n",
            " '(Temlyakov, 2000), and itis a Gauss-Southwell algorithm (Southwell, 1946) '\n",
            " 'for solving a linear system of equations. (b) Illustration of the fact that '\n",
            " 'at the end of a line search (top of picture), the local gradient of the '\n",
            " 'function will be perpendicular to the search direction Based on Figure '\n",
            " '10.6.1 of (Press et al. 1988). Let us develop a more stable method for '\n",
            " 'picking the step size, so that the method is guaran- teed to converge to a '\n",
            " 'local optimum no matter where we start. (This property is called global '\n",
            " 'convergence, which should not be confused with convergence to the global '\n",
            " 'optimum!) That is, the method may claim the chain has converged when in fact '\n",
            " 'it has not: This is a flaw common to all convergence diagnostics, since '\n",
            " 'diagnosing convergence is computationally intractable in general (Bhatnagar '\n",
            " 'et al. 2010). In calculus we say that a sequence of real numbers Tn '\n",
            " 'converges to a limit I if, for every <CUR> 0, Ixn ~ x| <CUR> for all large '\n",
            " 'n. In probability, convergence is more subtle: Going back to calculus for a '\n",
            " 'moment, suppose that xn = x for all n _ Then, trivially; limn-oo In x _ '\n",
            " 'Consider a pr...')\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "download(\"download_ba3e04d0-3e9a-44c7-9b66-2cde1fb44c30\", \"SummarySearchTerms_intro to ML_exported_terms from past exams_allenailedlarge16384arxiv08.19.2021, 2308.txt\", 7109)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "Completed Summary Search -  08.19.2021, 23-08\n",
            "\n",
            "\n",
            "\n",
            "\n",
            " Moving to next term list 1629414487.868539 \n",
            "\n",
            "\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qMh68201DvER"
      },
      "source": [
        "\n",
        "---\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "D9JLAlTyJ4cE"
      },
      "source": [
        "# save data"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "r-mlRsM2Rflr"
      },
      "source": [
        "model data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VzmmjdINJ5m5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "13138717-5747-4557-878b-88f5fd3a9b40"
      },
      "source": [
        "date_time = datetime.now().strftime(\"%m.%d.%Y, %H-%M\")\n",
        "header = remove_string_extras(course_name + \"_info-retrieval_\" + cust_model_name + date_time)\n",
        "\n",
        "qa_name = \"[QA-pipeline-{}]\".format(questions_version) + header + \".yaml\"\n",
        "ret_name = \"[Retriever-pipeline-{}]\".format(questions_version) + header + \".yaml\"\n",
        "sum_name = \"[Summarizer-pipeline-{}]\".format(questions_version) + header + \".yaml\"\n",
        "\n",
        "outnames = [qa_name, ret_name, sum_name]"
      ],
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b1T7_slORfKW"
      },
      "source": [
        "dataframe of responses"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jw-pKzNFRfeF",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "f98c0183-35c5-4c75-b203-49b3127ad4ea"
      },
      "source": [
        "df_name_csv = \"Response DB - \" + header + \".csv\"\n",
        "info_queries.to_csv(df_name_csv)\n",
        "# download_file(df_name_csv, subfolder=\"haystack info retrieval - output queries\")\n",
        "\n",
        "df_name_xlsx = \"Response DB - \" + header + \".xlsx\"\n",
        "info_queries.to_excel(df_name_xlsx)\n",
        "download_file(df_name_xlsx,)"
      ],
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "download(\"download_356dcc3e-ede9-4307-bede-4aa0de728635\", \"Response DB - intro to ML_inforetrieval_googlebigbirdpegasuslargebigpatent08.19.2021, 2308.xlsx\", 38590)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YG7SZzS4R_Yo"
      },
      "source": [
        "## package information\n",
        "\n",
        "- used for validation, to check that the package versions in the script version and in Colab are roughly the same"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Xs4EOn82SDZU",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "outputId": "da1be147-54d8-4232-9da2-672ce25e59bf"
      },
      "source": [
        "from pip._internal.utils.misc import get_installed_distributions\n",
        "import sys\n",
        "#import numpy as np # imported to test whether numpy shows up, which it does!\n",
        "\n",
        "def get_imported_packages():\n",
        "    p = get_installed_distributions()\n",
        "    p = {package.key:package.version for package in p}\n",
        "\n",
        "    imported_modules = set(sys.modules.keys())\n",
        "    \n",
        "    imported_modules.remove('pip')\n",
        "\n",
        "    modules = [(m, p[m]) for m in imported_modules if p.get(m, False)]\n",
        "\n",
        "    return modules\n",
        "\n",
        "\n",
        "def generate_requirements(filepath:str, modules):\n",
        "    with open(filepath, 'w') as f:\n",
        "        for module, version in modules:\n",
        "            f.write(f\"{module}~={version}\" + \"\\n\")\n",
        "\n",
        "\n",
        "generate_requirements('requirements.txt', get_imported_packages())"
      ],
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N185uzh1U2Wi"
      },
      "source": [
        "why are you reading this lmao"
      ]
    }
  ]
}

## kmeans_def.png

      
    Raw
  

              kmeans_def.png