avidale/bert-ner-ru.ipynb

## bert-ner-ru.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Bert-NER-ru",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.9"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "38effe6811e0445ea6a06fbf62322cb2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_9c3e627f3f214708a0eada8855e345e1",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_4b4eaf184a9a482798ac08ff5febafb9",
              "IPY_MODEL_c3566754043543218423794f629214d6",
              "IPY_MODEL_b92ff60477c043e0b2bdaea88a70295b"
            ]
          }
        },
        "9c3e627f3f214708a0eada8855e345e1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "4b4eaf184a9a482798ac08ff5febafb9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_5e07fb58967a4d03931682646417f324",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_c8847e2fd0654eabb62848c927685ba4"
          }
        },
        "c3566754043543218423794f629214d6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_bb59a31387e84bc18937ca0aafb36230",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 341,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 341,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_7acb711b2f994c0c8c292f7735ccfd15"
          }
        },
        "b92ff60477c043e0b2bdaea88a70295b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_93c12e9ea0294f70b4dca310239b55e4",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 341/341 [00:00&lt;00:00, 6.59kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_44595961fd42457496803732f50e813b"
          }
        },
        "5e07fb58967a4d03931682646417f324": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "c8847e2fd0654eabb62848c927685ba4": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "bb59a31387e84bc18937ca0aafb36230": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "7acb711b2f994c0c8c292f7735ccfd15": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "93c12e9ea0294f70b4dca310239b55e4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "44595961fd42457496803732f50e813b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "97db5fbd0b134002a15f1e89b2e2a871": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_4ac3dbe72f9b4dd2b27d91630736fbcc",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_937d10a0f080492ab64efbb53f6ebe83",
              "IPY_MODEL_3dab395a7da545838fb6c54fc19420cc",
              "IPY_MODEL_712ed73184ec4551a6891862b3295b22"
            ]
          }
        },
        "4ac3dbe72f9b4dd2b27d91630736fbcc": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "937d10a0f080492ab64efbb53f6ebe83": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_5281ff75b7144218a6218c65d8789f48",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_bedec2ca0adc4239b35f94a23a9b3cdd"
          }
        },
        "3dab395a7da545838fb6c54fc19420cc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_8263aecd8ead4ceabf6112e536ac6dbd",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 632,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 632,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_e7f42df803cc4ed6ae43ead43e283ed8"
          }
        },
        "712ed73184ec4551a6891862b3295b22": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_57770139538d40c38c68112f622993f3",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 632/632 [00:00&lt;00:00, 20.9kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_c3caeafc16f44882938d7c1dcb56cb18"
          }
        },
        "5281ff75b7144218a6218c65d8789f48": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "bedec2ca0adc4239b35f94a23a9b3cdd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "8263aecd8ead4ceabf6112e536ac6dbd": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "e7f42df803cc4ed6ae43ead43e283ed8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "57770139538d40c38c68112f622993f3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "c3caeafc16f44882938d7c1dcb56cb18": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "0389bbf4dde74c4db0117a6f66c8808f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_bb94580fba5140078f7a4ac289308e6f",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_ff1d724506ef4ab1a73c4a6635a95cdb",
              "IPY_MODEL_6568a4b7ebba4b66ae324cfb1e4c6c56",
              "IPY_MODEL_37f3d4726084402ba1ff26773197b415"
            ]
          }
        },
        "bb94580fba5140078f7a4ac289308e6f": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "ff1d724506ef4ab1a73c4a6635a95cdb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_6f02d9d6f80249e4949123b465d56584",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_f5e2e39b8c344f81b40f43c7645b6fae"
          }
        },
        "6568a4b7ebba4b66ae324cfb1e4c6c56": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_2f8e9faf6afe4b278a4889a13e29fd68",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 241082,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 241082,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_ff12a26f63e643fbb301dffe15f375cf"
          }
        },
        "37f3d4726084402ba1ff26773197b415": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_2fd0ded25025401d917bc2624b23a5ce",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 241k/241k [00:00&lt;00:00, 3.11MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_fd461fa9ee1249dbb49e10bf7201f9d9"
          }
        },
        "6f02d9d6f80249e4949123b465d56584": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "f5e2e39b8c344f81b40f43c7645b6fae": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "2f8e9faf6afe4b278a4889a13e29fd68": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "ff12a26f63e643fbb301dffe15f375cf": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "2fd0ded25025401d917bc2624b23a5ce": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "fd461fa9ee1249dbb49e10bf7201f9d9": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "91e9fdb905864efa8d42ee7cb3680e08": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_d8aa99a5f7cb42b9b3ca28edb0b0a007",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_0f4c5d11ce814b83bb85389b3c5a4c5f",
              "IPY_MODEL_62ae82032a8a403b989ee8e0ab06f58a",
              "IPY_MODEL_89ccf6c81454461c9a94ee0b9820d4a9"
            ]
          }
        },
        "d8aa99a5f7cb42b9b3ca28edb0b0a007": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "0f4c5d11ce814b83bb85389b3c5a4c5f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_c76403b23ddd4fd5a644e63e21da8358",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_303a6382c4ff45439a40c47e969306da"
          }
        },
        "62ae82032a8a403b989ee8e0ab06f58a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_24bff1add1804f5ea46b5edaf4ca7428",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 468145,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 468145,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_6e2ff1b9214049188ac0587fb33b5352"
          }
        },
        "89ccf6c81454461c9a94ee0b9820d4a9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_5f25f5e5ad604408bcb53d0b70f67f20",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 468k/468k [00:00&lt;00:00, 5.63MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_76260400334d4c4f8985e69b4800ae28"
          }
        },
        "c76403b23ddd4fd5a644e63e21da8358": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "303a6382c4ff45439a40c47e969306da": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "24bff1add1804f5ea46b5edaf4ca7428": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "6e2ff1b9214049188ac0587fb33b5352": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5f25f5e5ad604408bcb53d0b70f67f20": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "76260400334d4c4f8985e69b4800ae28": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "21fdf7eb6fb94da0bc9639b3f4ea7f00": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_20b1444148dc40f89b107e30dbcf6c0b",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_36dc0d7a19c04a47a06618e187ee894a",
              "IPY_MODEL_791e009541a64d749330e6123ca7d87f",
              "IPY_MODEL_06c63accbd2a4903b762ed21545bfbbe"
            ]
          }
        },
        "20b1444148dc40f89b107e30dbcf6c0b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "36dc0d7a19c04a47a06618e187ee894a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_07fee35962004c8996c8acef923292eb",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b3332e5f66ad4c6c830f28bc290cd4bd"
          }
        },
        "791e009541a64d749330e6123ca7d87f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_b831ba8c276b4a1bb0ef7ae16a7a8fc9",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 112,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 112,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_7eff20962dfe422b9523c4e74f5372aa"
          }
        },
        "06c63accbd2a4903b762ed21545bfbbe": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_663c10e13a0e47f7b115ad50bd5b3965",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 112/112 [00:00&lt;00:00, 3.03kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_997504803ac5445588c07cf97049d14a"
          }
        },
        "07fee35962004c8996c8acef923292eb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b3332e5f66ad4c6c830f28bc290cd4bd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "b831ba8c276b4a1bb0ef7ae16a7a8fc9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "7eff20962dfe422b9523c4e74f5372aa": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "663c10e13a0e47f7b115ad50bd5b3965": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "997504803ac5445588c07cf97049d14a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "e16827a6f03a4b92889daf18d9917126": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_7b3185264cfe469683ad9cc81b0a8484",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_20ea5f6227b041969b1ce0d686a39121",
              "IPY_MODEL_78818c7330fd489c8820d845afab2fca",
              "IPY_MODEL_19c53fdf63a0408e8ad85e56a94d7dcd"
            ]
          }
        },
        "7b3185264cfe469683ad9cc81b0a8484": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "20ea5f6227b041969b1ce0d686a39121": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_43d9892c92f343369d714477c706d45a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_91f00c3eed1b41d288a53cf829f88555"
          }
        },
        "78818c7330fd489c8820d845afab2fca": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_f2f2a65d8c5d4627855526eccf8c68d7",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 4,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 4,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_2c4a9988fa90474ba9aa1f48bf03704a"
          }
        },
        "19c53fdf63a0408e8ad85e56a94d7dcd": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_ec76c271eccd45c7b8d28a15274b1d50",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 4/4 [00:00&lt;00:00,  6.93ba/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_eefb8e1e66ff486e92c0ce618c12be66"
          }
        },
        "43d9892c92f343369d714477c706d45a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "91f00c3eed1b41d288a53cf829f88555": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f2f2a65d8c5d4627855526eccf8c68d7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "2c4a9988fa90474ba9aa1f48bf03704a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "ec76c271eccd45c7b8d28a15274b1d50": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "eefb8e1e66ff486e92c0ce618c12be66": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "b27b3845581c4dcba258672ecde20982": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_16091398feea4bfb8e5c07a679934a3e",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_b6f776fb81874175a00f9d4569edf89c",
              "IPY_MODEL_9ae961550c684fbda20bbec6043ca80e",
              "IPY_MODEL_54e1fd908fed41c981e8bb39068da20c"
            ]
          }
        },
        "16091398feea4bfb8e5c07a679934a3e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "b6f776fb81874175a00f9d4569edf89c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_c46c29c17e5d41128bb8bc401c5c4c8c",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d0470494b6be4bf0af4fc7007e285149"
          }
        },
        "9ae961550c684fbda20bbec6043ca80e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_9c406cfcfcf9424181939d2f6009090a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d2e8a291b9c54a45ba9f812ec6d19fcc"
          }
        },
        "54e1fd908fed41c981e8bb39068da20c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_6fb72f595fb14608ae7e6a20c2e410a9",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1/1 [00:00&lt;00:00,  5.44ba/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_e174a5cbd06441329ccd8cb547b44503"
          }
        },
        "c46c29c17e5d41128bb8bc401c5c4c8c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d0470494b6be4bf0af4fc7007e285149": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9c406cfcfcf9424181939d2f6009090a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d2e8a291b9c54a45ba9f812ec6d19fcc": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6fb72f595fb14608ae7e6a20c2e410a9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "e174a5cbd06441329ccd8cb547b44503": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "178b2e70a03141c3a8d14c03b1024d34": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_a2a75ec023b64fb0813a0d3b9da549a3",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_6446a516afdf4406a5a8c876aa2a0179",
              "IPY_MODEL_787dde6c71aa4303a3f1ae908bdf3288",
              "IPY_MODEL_04e795570e2245e2844e7acfd36611e4"
            ]
          }
        },
        "a2a75ec023b64fb0813a0d3b9da549a3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6446a516afdf4406a5a8c876aa2a0179": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_a0441eab19d441d7ae60ea926a1ddabe",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: 100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_00a1bad590e34fc09067414ed4bae1d9"
          }
        },
        "787dde6c71aa4303a3f1ae908bdf3288": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_5b3223766ec44a0781e88c6f0d276c12",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 47679974,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 47679974,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_085cbbc409874d57b38930b6b05ecfd9"
          }
        },
        "04e795570e2245e2844e7acfd36611e4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_1a255ac062e94624a2ecfb4f58889d74",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 47.7M/47.7M [00:01&lt;00:00, 47.3MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_1d51207792fd4afb849e2ae72ddd68ce"
          }
        },
        "a0441eab19d441d7ae60ea926a1ddabe": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "00a1bad590e34fc09067414ed4bae1d9": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5b3223766ec44a0781e88c6f0d276c12": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "085cbbc409874d57b38930b6b05ecfd9": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "1a255ac062e94624a2ecfb4f58889d74": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "1d51207792fd4afb849e2ae72ddd68ce": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "bba28fea430d436981b0bfab06fb4ee6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_fe16833673ba4dc79001d9b3d28eb6d2",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_513e9fa4a8a04f889d68ac7658818465",
              "IPY_MODEL_b38e9ab8c6fa4ea1952a2265dcdcfff5",
              "IPY_MODEL_91c93a7307854ac98d9bdf6746286517"
            ]
          }
        },
        "fe16833673ba4dc79001d9b3d28eb6d2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "513e9fa4a8a04f889d68ac7658818465": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_c291de1c9ba343ff8e140a8c7ef1496a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "Downloading: ",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b1e72abd13c84f08b763eae36928fb4e"
          }
        },
        "b38e9ab8c6fa4ea1952a2265dcdcfff5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_251fc9cb4f574aa4881b722cef11324a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 2482,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 2482,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_71c5d3acf8674d77b38d6fa6b0aba8d5"
          }
        },
        "91c93a7307854ac98d9bdf6746286517": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_94419c78831745928f8e5327f53ef5e0",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 6.34k/? [00:00&lt;00:00, 157kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_31a5bb08896f47309e658dac697b6680"
          }
        },
        "c291de1c9ba343ff8e140a8c7ef1496a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b1e72abd13c84f08b763eae36928fb4e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "251fc9cb4f574aa4881b722cef11324a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "71c5d3acf8674d77b38d6fa6b0aba8d5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "94419c78831745928f8e5327f53ef5e0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "31a5bb08896f47309e658dac697b6680": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/avidale/cacf235aebeaaf4c578389e1146c3c57/bert-ner-ru.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "X4cRE8IbIrIV"
      },
      "source": [
        "Основано на блокноте https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MOsHUjgdIrIW"
      },
      "source": [
        "! pip install datasets transformers seqeval"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4HL1yaESsJA9"
      },
      "source": [
        "В этом блокноте мы дообучаем модель на задаче классификации отдельных слов, а именно, распознавание именованных сущностей (aka named entity recognition, aka NER). Мы возьмём датасет медицинских сущностей, но в целом пайплайн подходит для любой задачи на выделение сущностей в тексте. \n",
        "\n",
        "Для скорости мы возьмём маленький BERT для русского языка [rubert-tiny](https://huggingface.co/cointegrated/rubert-tiny); если взять другую, более крупную BERT-подобную модель, качество NER может быть выше, но и время обучения и работы будет дольше \n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4RRkXuteIrIh"
      },
      "source": [
        "This notebook is built to run on any token classification task, with any model checkpoint from the [Model Hub](https://huggingface.co/models) as long as that model has a version with a token classification head and a fast tokenizer (check on [this table](https://huggingface.co/transformers/index.html#bigtable) if this is the case). It might just need some small adjustments if you decide to use a different dataset than the one used here. Depending on you model and the GPU you are using, you might need to adjust the batch size to avoid out-of-memory errors. Set those three parameters, then the rest of the notebook should run smoothly:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zVvslsfMIrIh"
      },
      "source": [
        "model_checkpoint = \"cointegrated/rubert-tiny\"\n",
        "batch_size = 16"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "whPRbBNbIrIl"
      },
      "source": [
        "## Loading the dataset"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J8mt63rWvkv3"
      },
      "source": [
        "Для обучения мы возьмём [Russian Drug Reaction Corpus](https://github.com/cimm-kzn/RuDReC): размеченный корпус русскоязычных отзывов на лекарства. \n",
        "\n",
        "Загрузим мы его библиотекой corus, потому что это удобно "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IreSlFmlIrIm"
      },
      "source": [
        "from datasets import load_dataset, load_metric"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "s_AY1ATSIrIq"
      },
      "source": [
        "!wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json\n",
        "!pip install corus razdel"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VALH-KBTMfVI",
        "outputId": "385f3c15-6aa7-4c9a-bd02-8db5c3a593fb"
      },
      "source": [
        "from corus import load_rudrec\n",
        "drugs = list(load_rudrec('rudrec_annotated.json'))\n",
        "print(len(drugs))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "4809\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fGBywJmAv2NN"
      },
      "source": [
        "Пример документа:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ynPlkV5gv4XC",
        "outputId": "e3fe1c20-6f9d-4921-d56b-71d810de8143"
      },
      "source": [
        "drugs[0]"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "RuDReCRecord(\n",
              "    file_name='172744.tsv',\n",
              "    text='нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... У кого ещё такие побочки, отзовитесь!1 Чем спасались?\\n',\n",
              "    sentence_id=0,\n",
              "    entities=[RuDReCEntity(\n",
              "         entity_id='*[0]_se',\n",
              "         entity_text='виферона',\n",
              "         entity_type='Drugform',\n",
              "         start=122,\n",
              "         end=130,\n",
              "         concept_id='C0021735',\n",
              "         concept_name=nan\n",
              "     ), RuDReCEntity(\n",
              "         entity_id='*[1]',\n",
              "         entity_text='сыпью покрылся',\n",
              "         entity_type='ADR',\n",
              "         start=31,\n",
              "         end=45,\n",
              "         concept_id='C0015230',\n",
              "         concept_name=nan\n",
              "     ), RuDReCEntity(\n",
              "         entity_id='*[2]',\n",
              "         entity_text='глаза опухли',\n",
              "         entity_type='ADR',\n",
              "         start=47,\n",
              "         end=59,\n",
              "         concept_id='C4760994',\n",
              "         concept_name=nan\n",
              "     ), RuDReCEntity(\n",
              "         entity_id='*[3]',\n",
              "         entity_text='на веках высыпала сыпь',\n",
              "         entity_type='ADR',\n",
              "         start=76,\n",
              "         end=98,\n",
              "         concept_id='C0015230',\n",
              "         concept_name=nan\n",
              "     )]\n",
              ")"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iSpV6RLEwI5o"
      },
      "source": [
        "Посмотрим, какие сущности есть: лекарства, форма лекарств, класс лекарств, показания к применению, побочки, и прочие болезни/симптомы.\n",
        "\n",
        "https://arxiv.org/abs/2004.03659\n",
        "\n",
        "* **DRUGNAME** Mentions of the brand name of a drug or product\n",
        "ingredients/active compounds.\n",
        "* **DRUGCLASS** Mentions of drug classes such as anti-inflammatory or\n",
        "cardiovascular.\n",
        "* **DRUGFORM** Mentions of routes of administration such as tablet\n",
        "or liquid that describe the physical form in which\n",
        "medication will be delivered into patient’s organism.\n",
        "* **DI** Any indication/symptom that specifies the reason for\n",
        "taking/prescribing the drug.\n",
        "* **ADR** Mentions of untoward medical events that occur as a\n",
        "consequence of drug intake and are not associated with\n",
        "treated symptoms.\n",
        "* **FINDING** Any DI or ADR that was not directly experienced by the\n",
        "reporting patient or his/her family members, or related to\n",
        "medical history/drug label, or any disease entities if the\n",
        "annotator is not clear about type"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vo8MIceYNPjx",
        "outputId": "23d8f04f-f7d3-4a20-b840-a8b915b8ab47"
      },
      "source": [
        "from collections import Counter, defaultdict\n",
        "type2text = defaultdict(Counter)\n",
        "ents = Counter()\n",
        "for item in drugs:\n",
        "    for e in item.entities:\n",
        "        ents[e.entity_type] += 1\n",
        "        type2text[e.entity_type][e.entity_text] += 1\n",
        "\n",
        "for k, v in ents.most_common():\n",
        "    print(k, v)\n",
        "    print(type2text[k].most_common(3))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "DI 1401\n",
            "[('простуды', 64), ('ОРВИ', 47), ('профилактики', 42)]\n",
            "Drugname 1043\n",
            "[('Виферон', 33), ('Анаферон', 25), ('Циклоферон', 24)]\n",
            "Drugform 836\n",
            "[('таблетки', 154), ('таблеток', 79), ('свечи', 63)]\n",
            "ADR 720\n",
            "[('аллергия', 16), ('слабость', 13), ('диарея', 12)]\n",
            "Drugclass 330\n",
            "[('противовирусный', 21), ('противовирусное', 18), ('противовирусных', 13)]\n",
            "Finding 236\n",
            "[('аллергии', 12), ('температуры', 6), ('сонливости', 5)]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "id": "0Kszaqs8N0Ig",
        "outputId": "34a697ef-c96d-40bb-c6ed-fe6f2c979ab1"
      },
      "source": [
        "drugs[0].text"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... У кого ещё такие побочки, отзовитесь!1 Чем спасались?\\n'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RzfPtOMoIrIu"
      },
      "source": [
        "Напишем функцию, перекладывающую разметку сущностей на уровень слов. Будем использовать [IOB](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging))-нотацию, чтобы разделять несколько сущностей одного типа, идущих подряд. "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Dg9BL4Z_OcjY"
      },
      "source": [
        "from razdel import tokenize\n",
        "\n",
        "def extract_labels(item):\n",
        "    raw_toks = list(tokenize(item.text))\n",
        "    words = [tok.text for tok in raw_toks]\n",
        "    word_labels = ['O'] * len(raw_toks)\n",
        "    char2word = [None] * len(item.text)\n",
        "    for i, word in enumerate(raw_toks):\n",
        "        char2word[word.start:word.stop] = [i] * len(word.text)\n",
        "\n",
        "    for e in item.entities:\n",
        "        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})\n",
        "        word_labels[e_words[0]] = 'B-' + e.entity_type\n",
        "        for idx in e_words[1:]:\n",
        "            word_labels[idx] = 'I-' + e.entity_type\n",
        "\n",
        "    return {'tokens': words, 'tags': word_labels}"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PCGwQAadOVA9",
        "outputId": "cb55c0b3-bdc5-4b5c-feae-c560c38554cd"
      },
      "source": [
        "print(extract_labels(drugs[0]))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'tokens': ['нам', 'прописали', ',', 'так', 'мой', 'ребенок', 'сыпью', 'покрылся', ',', 'глаза', 'опухли', ',', 'сверху', 'и', 'снизу', 'на', 'веках', 'высыпала', 'сыпь', ',', '(', '8', 'месяцев', 'сыну', ')', 'А', 'от', 'виферона', 'такого', 'не', 'было', '...', 'У', 'кого', 'ещё', 'такие', 'побочки', ',', 'отзовитесь', '!', '1', 'Чем', 'спасались', '?'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'O', 'B-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Drugform', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Chhlmjt8OEgn"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "ner_data = [extract_labels(item) for item in drugs]\n",
        "ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yvApziHbyUyR"
      },
      "source": [
        "Пример данных"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 137
        },
        "id": "17yA19oFRwMk",
        "outputId": "0f5322ef-f6cf-4099-a34c-d298a3a72f72"
      },
      "source": [
        "import pandas as pd\n",
        "pd.options.display.max_colwidth = 300\n",
        "pd.DataFrame(ner_train).sample(3)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>tokens</th>\n",
              "      <th>tags</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>3132</th>\n",
              "      <td>[Но, в, 3, месяца, нам, ставили, гипертонус, ручек, и, ножек, .]</td>\n",
              "      <td>[O, O, O, O, O, O, B-DI, I-DI, I-DI, I-DI, O]</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>355</th>\n",
              "      <td>[У, меня, двое, детей, .]</td>\n",
              "      <td>[O, O, O, O, O]</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3101</th>\n",
              "      <td>[Не, спорю, наслышана, о, широте, его, применения, ,, но, нам, он, не, подошел, абсолютно, !]</td>\n",
              "      <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                                                             tokens                                           tags\n",
              "3132                               [Но, в, 3, месяца, нам, ставили, гипертонус, ручек, и, ножек, .]  [O, O, O, O, O, O, B-DI, I-DI, I-DI, I-DI, O]\n",
              "355                                                                       [У, меня, двое, детей, .]                                [O, O, O, O, O]\n",
              "3101  [Не, спорю, наслышана, о, широте, его, применения, ,, но, нам, он, не, подошел, абсолютно, !]  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sE0souTBykq1"
      },
      "source": [
        "Соберём все виды меток в список. "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "16SRNc6csJBC",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a06bc57e-5b17-4b45-8009-a7cbfc5e7592"
      },
      "source": [
        "label_list = sorted({label for item in ner_train for label in item['tags']})\n",
        "if 'O' in label_list:\n",
        "    label_list.remove('O')\n",
        "    label_list = ['O'] + label_list\n",
        "label_list"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['O',\n",
              " 'B-ADR',\n",
              " 'B-DI',\n",
              " 'B-Drugclass',\n",
              " 'B-Drugform',\n",
              " 'B-Drugname',\n",
              " 'B-Finding',\n",
              " 'I-ADR',\n",
              " 'I-DI',\n",
              " 'I-Drugclass',\n",
              " 'I-Drugform',\n",
              " 'I-Drugname',\n",
              " 'I-Finding']"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ckjbVWLoyYYf"
      },
      "source": [
        "Сложим наши данные в объект [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), нативный для huggingface."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4E3yy6wmUp-z"
      },
      "source": [
        "from datasets import Dataset, DatasetDict"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3YavIR4eU5ZY",
        "outputId": "8b2ae069-88ac-405a-83a8-ff4a73d7d215"
      },
      "source": [
        "ner_data = DatasetDict({\n",
        "    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),\n",
        "    'test': Dataset.from_pandas(pd.DataFrame(ner_test))\n",
        "})\n",
        "ner_data"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "DatasetDict({\n",
              "    train: Dataset({\n",
              "        features: ['tokens', 'tags'],\n",
              "        num_rows: 3847\n",
              "    })\n",
              "    test: Dataset({\n",
              "        features: ['tokens', 'tags'],\n",
              "        num_rows: 962\n",
              "    })\n",
              "})"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n9qywopnIrJH"
      },
      "source": [
        "## Preprocessing the data"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YVx71GdAIrJH"
      },
      "source": [
        "Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.\n",
        "\n",
        "To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n",
        "\n",
        "- we get a tokenizer that corresponds to the model architecture we want to use,\n",
        "- we download the vocabulary used when pretraining this specific checkpoint.\n",
        "\n",
        "That vocabulary will be cached, so it's not downloaded again the next time we run the cell."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eXNLu_-nIrJI",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 177,
          "referenced_widgets": [
            "38effe6811e0445ea6a06fbf62322cb2",
            "9c3e627f3f214708a0eada8855e345e1",
            "4b4eaf184a9a482798ac08ff5febafb9",
            "c3566754043543218423794f629214d6",
            "b92ff60477c043e0b2bdaea88a70295b",
            "5e07fb58967a4d03931682646417f324",
            "c8847e2fd0654eabb62848c927685ba4",
            "bb59a31387e84bc18937ca0aafb36230",
            "7acb711b2f994c0c8c292f7735ccfd15",
            "93c12e9ea0294f70b4dca310239b55e4",
            "44595961fd42457496803732f50e813b",
            "97db5fbd0b134002a15f1e89b2e2a871",
            "4ac3dbe72f9b4dd2b27d91630736fbcc",
            "937d10a0f080492ab64efbb53f6ebe83",
            "3dab395a7da545838fb6c54fc19420cc",
            "712ed73184ec4551a6891862b3295b22",
            "5281ff75b7144218a6218c65d8789f48",
            "bedec2ca0adc4239b35f94a23a9b3cdd",
            "8263aecd8ead4ceabf6112e536ac6dbd",
            "e7f42df803cc4ed6ae43ead43e283ed8",
            "57770139538d40c38c68112f622993f3",
            "c3caeafc16f44882938d7c1dcb56cb18",
            "0389bbf4dde74c4db0117a6f66c8808f",
            "bb94580fba5140078f7a4ac289308e6f",
            "ff1d724506ef4ab1a73c4a6635a95cdb",
            "6568a4b7ebba4b66ae324cfb1e4c6c56",
            "37f3d4726084402ba1ff26773197b415",
            "6f02d9d6f80249e4949123b465d56584",
            "f5e2e39b8c344f81b40f43c7645b6fae",
            "2f8e9faf6afe4b278a4889a13e29fd68",
            "ff12a26f63e643fbb301dffe15f375cf",
            "2fd0ded25025401d917bc2624b23a5ce",
            "fd461fa9ee1249dbb49e10bf7201f9d9",
            "91e9fdb905864efa8d42ee7cb3680e08",
            "d8aa99a5f7cb42b9b3ca28edb0b0a007",
            "0f4c5d11ce814b83bb85389b3c5a4c5f",
            "62ae82032a8a403b989ee8e0ab06f58a",
            "89ccf6c81454461c9a94ee0b9820d4a9",
            "c76403b23ddd4fd5a644e63e21da8358",
            "303a6382c4ff45439a40c47e969306da",
            "24bff1add1804f5ea46b5edaf4ca7428",
            "6e2ff1b9214049188ac0587fb33b5352",
            "5f25f5e5ad604408bcb53d0b70f67f20",
            "76260400334d4c4f8985e69b4800ae28",
            "21fdf7eb6fb94da0bc9639b3f4ea7f00",
            "20b1444148dc40f89b107e30dbcf6c0b",
            "36dc0d7a19c04a47a06618e187ee894a",
            "791e009541a64d749330e6123ca7d87f",
            "06c63accbd2a4903b762ed21545bfbbe",
            "07fee35962004c8996c8acef923292eb",
            "b3332e5f66ad4c6c830f28bc290cd4bd",
            "b831ba8c276b4a1bb0ef7ae16a7a8fc9",
            "7eff20962dfe422b9523c4e74f5372aa",
            "663c10e13a0e47f7b115ad50bd5b3965",
            "997504803ac5445588c07cf97049d14a"
          ]
        },
        "outputId": "61183128-8efa-44c0-83e5-1bddd1d0fc06"
      },
      "source": [
        "from transformers import AutoTokenizer\n",
        "    \n",
        "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "38effe6811e0445ea6a06fbf62322cb2",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "97db5fbd0b134002a15f1e89b2e2a871",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0389bbf4dde74c4db0117a6f66c8808f",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/241k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "91e9fdb905864efa8d42ee7cb3680e08",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/468k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "21fdf7eb6fb94da0bc9639b3f4ea7f00",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rowT4iCLIrJK"
      },
      "source": [
        "You can directly call this tokenizer on one sentence:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "a5hBlsrHIrJL",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "685cf29d-15c9-40ac-e802-ae3035b0ca14"
      },
      "source": [
        "tokenizer(\"Hello, this is one sentence!\")"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'input_ids': [2, 9944, 16, 881, 550, 835, 15503, 5, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZdStg37nsJBE"
      },
      "source": [
        "Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.\n",
        "\n",
        "If, as is the case here, your inputs have already been split into words, you should pass the list of words to your tokenzier with the argument `is_split_into_words=True`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "b_yJ2hgDsJBF",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "211c06e6-ca98-46db-eac3-0f4c6809a400"
      },
      "source": [
        "tokenizer([\"Hello\", \",\", \"this\", \"is\", \"one\", \"sentence\", \"split\", \"into\", \"words\", \".\"], is_split_into_words=True)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'input_ids': [2, 9944, 16, 881, 550, 835, 15503, 7440, 996, 6301, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4JdDuFvbsJBF"
      },
      "source": [
        "Note that transformers are often pretrained with subword tokenizers, meaning that even if your inputs have been split into words already, each of those words could be split again by the tokenizer. Let's look at an example of that:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OjrkjteOsJBF",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a2841a74-4493-4417-cad8-e53d87875542"
      },
      "source": [
        "example = ner_train[5]\n",
        "print(example[\"tokens\"])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['Мы', 'поменяли', 'место', 'жительства', 'и', 'перевели', 'дочь', 'в', 'школу', ',', 'которая', 'находится', 'ближе', 'к', 'дому', '.']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QU8fkdJMsJBF",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "9eec8122-4df6-43d8-8b53-52b296e1b640"
      },
      "source": [
        "tokenized_input = tokenizer(example[\"tokens\"], is_split_into_words=True)\n",
        "tokens = tokenizer.convert_ids_to_tokens(tokenized_input[\"input_ids\"])\n",
        "print(tokens)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['[CLS]', 'Мы', 'пом', '##ен', '##яли', 'место', 'ж', '##итель', '##ства', 'и', 'пер', '##еве', '##ли', 'дочь', 'в', 'школу', ',', 'которая', 'находится', 'б', '##ли', '##же', 'к', 'дому', '.', '[SEP]']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-7zwzY9wsJBG"
      },
      "source": [
        "Чтобы перейти с уровня слов на уровень subword tokens, нужно ещё раз предобработать тексты."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F39uz6wusJBG",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "b0d90db2-f812-4478-bb30-b57235a6a204"
      },
      "source": [
        "len(example[\"tags\"]), len(tokenized_input[\"input_ids\"])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(16, 26)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pnBazSrTsJBG"
      },
      "source": [
        "Thankfully, the tokenizer returns outputs that have a `word_ids` method which can help us."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Rt7_5_bXsJBH",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "90d5e41e-92e4-4199-bcac-f71e2dd3488c"
      },
      "source": [
        "print(tokenized_input.word_ids())"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[None, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 13, 14, 15, None]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rP1PvW2isJBH"
      },
      "source": [
        "As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to `None` and all other tokens to their respective word. This way, we can align the labels with the processed input ids."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NeVhtoANsJBH",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "dcc7ae1b-6d60-4abb-c33f-ba9bad0bd859"
      },
      "source": [
        "word_ids = tokenized_input.word_ids()\n",
        "aligned_labels = [-100 if i is None else example[\"tags\"][i] for i in word_ids]\n",
        "print(len(aligned_labels), len(tokenized_input[\"input_ids\"]))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "26 26\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MM4fgSPDsJBH"
      },
      "source": [
        "Here we set the labels of all special tokens to -100 (the index that is ignored by PyTorch) and the labels of all other tokens to the label of the word they come from. Another strategy is to set the label only on the first token obtained from a given word, and give a label of -100 to the other subtokens from the same word. We propose the two strategies here, just change the flag `label_all_tokens`."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2C0hcmp9IrJQ"
      },
      "source": [
        "We're now ready to write the function that will preprocess our samples. We feed them to the `tokenizer` with the argument `truncation=True` (to truncate texts that are bigger than the maximum size allowed by the model) and `is_split_into_words=True` (as seen above). Then we align the labels with the token ids using the strategy we picked:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vc0BSBLIIrJQ"
      },
      "source": [
        "def tokenize_and_align_labels(examples, label_all_tokens=True):\n",
        "    tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
        "\n",
        "    labels = []\n",
        "    for i, label in enumerate(examples['tags']):\n",
        "        word_ids = tokenized_inputs.word_ids(batch_index=i)\n",
        "        previous_word_idx = None\n",
        "        label_ids = []\n",
        "        for word_idx in word_ids:\n",
        "            # Special tokens have a word id that is None. We set the label to -100 so they are automatically\n",
        "            # ignored in the loss function.\n",
        "            if word_idx is None:\n",
        "                label_ids.append(-100)\n",
        "            # We set the label for the first token of each word.\n",
        "            elif word_idx != previous_word_idx:\n",
        "                label_ids.append(label[word_idx])\n",
        "            # For the other tokens in a word, we set the label to either the current label or -100, depending on\n",
        "            # the label_all_tokens flag.\n",
        "            else:\n",
        "                label_ids.append(label[word_idx] if label_all_tokens else -100)\n",
        "            previous_word_idx = word_idx\n",
        "\n",
        "        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]\n",
        "\n",
        "        labels.append(label_ids)\n",
        "\n",
        "    tokenized_inputs[\"labels\"] = labels\n",
        "    return tokenized_inputs"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0lm8ozrJIrJR"
      },
      "source": [
        "This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-b70jh26IrJS",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "7d76f9ad-a433-444f-b118-2bdb9d2013cd"
      },
      "source": [
        "tokenize_and_align_labels(ner_data['train'][22:23])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'input_ids': [[2, 1041, 4033, 3236, 9267, 331, 19173, 19106, 26629, 1887, 22018, 548, 22276, 320, 21538, 16, 705, 13718, 22264, 548, 18397, 14063, 11137, 626, 16296, 24531, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zS-6iXTkIrJT"
      },
      "source": [
        "To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DDtsaJeVIrJT",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 81,
          "referenced_widgets": [
            "e16827a6f03a4b92889daf18d9917126",
            "7b3185264cfe469683ad9cc81b0a8484",
            "20ea5f6227b041969b1ce0d686a39121",
            "78818c7330fd489c8820d845afab2fca",
            "19c53fdf63a0408e8ad85e56a94d7dcd",
            "43d9892c92f343369d714477c706d45a",
            "91f00c3eed1b41d288a53cf829f88555",
            "f2f2a65d8c5d4627855526eccf8c68d7",
            "2c4a9988fa90474ba9aa1f48bf03704a",
            "ec76c271eccd45c7b8d28a15274b1d50",
            "eefb8e1e66ff486e92c0ce618c12be66",
            "b27b3845581c4dcba258672ecde20982",
            "16091398feea4bfb8e5c07a679934a3e",
            "b6f776fb81874175a00f9d4569edf89c",
            "9ae961550c684fbda20bbec6043ca80e",
            "54e1fd908fed41c981e8bb39068da20c",
            "c46c29c17e5d41128bb8bc401c5c4c8c",
            "d0470494b6be4bf0af4fc7007e285149",
            "9c406cfcfcf9424181939d2f6009090a",
            "d2e8a291b9c54a45ba9f812ec6d19fcc",
            "6fb72f595fb14608ae7e6a20c2e410a9",
            "e174a5cbd06441329ccd8cb547b44503"
          ]
        },
        "outputId": "b600760b-c8ce-4744-b0bc-fa777a6e9331"
      },
      "source": [
        "tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "e16827a6f03a4b92889daf18d9917126",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "  0%|          | 0/4 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "b27b3845581c4dcba258672ecde20982",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "  0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "voWiw8C7IrJV"
      },
      "source": [
        "Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to map has changed (and thus requires to not use the cache data). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files, you can pass `load_from_cache_file=False` in the call to `map` to not use the cached files and force the preprocessing to be applied again.\n",
        "\n",
        "Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "545PP3o8IrJV"
      },
      "source": [
        "## Fine-tuning the model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FBiW8UpKIrJW"
      },
      "source": [
        "Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, we use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which we can get from the features, as seen before):"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4d2zEASrUs1E",
        "outputId": "7f0158f6-0aae-4fac-a11c-ad4996558bfc"
      },
      "source": [
        "label_list"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['O',\n",
              " 'B-ADR',\n",
              " 'B-DI',\n",
              " 'B-Drugclass',\n",
              " 'B-Drugform',\n",
              " 'B-Drugname',\n",
              " 'B-Finding',\n",
              " 'I-ADR',\n",
              " 'I-DI',\n",
              " 'I-Drugclass',\n",
              " 'I-Drugform',\n",
              " 'I-Drugname',\n",
              " 'I-Finding']"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 28
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TlqNaB8jIrJW",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 152,
          "referenced_widgets": [
            "178b2e70a03141c3a8d14c03b1024d34",
            "a2a75ec023b64fb0813a0d3b9da549a3",
            "6446a516afdf4406a5a8c876aa2a0179",
            "787dde6c71aa4303a3f1ae908bdf3288",
            "04e795570e2245e2844e7acfd36611e4",
            "a0441eab19d441d7ae60ea926a1ddabe",
            "00a1bad590e34fc09067414ed4bae1d9",
            "5b3223766ec44a0781e88c6f0d276c12",
            "085cbbc409874d57b38930b6b05ecfd9",
            "1a255ac062e94624a2ecfb4f58889d74",
            "1d51207792fd4afb849e2ae72ddd68ce"
          ]
        },
        "outputId": "4adff280-2147-4280-aff7-25b4b22c22f1"
      },
      "source": [
        "from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer\n",
        "\n",
        "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))\n",
        "model.config.id2label = dict(enumerate(label_list))\n",
        "model.config.label2id = {v: k for k, v in model.config.id2label.items()}"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "178b2e70a03141c3a8d14c03b1024d34",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/47.7M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
            "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CczA5lJlIrJX"
      },
      "source": [
        "The warning is telling us we are throwing away some weights (the `vocab_transform` and `vocab_layer_norm` layers) and randomly initializing some other (the `pre_classifier` and `classifier` layers). This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_N8urzhyIrJY"
      },
      "source": [
        "To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Bliy8zgjIrJY"
      },
      "source": [
        "args = TrainingArguments(\n",
        "    \"ner\",\n",
        "    evaluation_strategy = \"epoch\",\n",
        "    learning_rate=2e-5,\n",
        "    per_device_train_batch_size=batch_size,\n",
        "    per_device_eval_batch_size=batch_size,\n",
        "    num_train_epochs=10,\n",
        "    weight_decay=0.01,\n",
        "    save_strategy='no',\n",
        "    report_to='none',\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "km3pGVdTIrJc"
      },
      "source": [
        "Here we set the evaluation to be done at the end of each epoch, tweak the learning rate, use the `batch_size` defined at the top of the notebook and customize the number of epochs for training, as well as the weight decay."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4e6jrE3TsJBM"
      },
      "source": [
        "Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each pad will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pyiUUwuCsJBM"
      },
      "source": [
        "from transformers import DataCollatorForTokenClassification\n",
        "\n",
        "data_collator = DataCollatorForTokenClassification(tokenizer)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YY7DtOOesJBM"
      },
      "source": [
        "The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qFF2_ArssJBM",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "bba28fea430d436981b0bfab06fb4ee6",
            "fe16833673ba4dc79001d9b3d28eb6d2",
            "513e9fa4a8a04f889d68ac7658818465",
            "b38e9ab8c6fa4ea1952a2265dcdcfff5",
            "91c93a7307854ac98d9bdf6746286517",
            "c291de1c9ba343ff8e140a8c7ef1496a",
            "b1e72abd13c84f08b763eae36928fb4e",
            "251fc9cb4f574aa4881b722cef11324a",
            "71c5d3acf8674d77b38d6fa6b0aba8d5",
            "94419c78831745928f8e5327f53ef5e0",
            "31a5bb08896f47309e658dac697b6680"
          ]
        },
        "outputId": "d0e04cc6-0334-48a3-851e-9688ee5127cf"
      },
      "source": [
        "metric = load_metric(\"seqeval\")"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "bba28fea430d436981b0bfab06fb4ee6",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ennxn1jysJBM"
      },
      "source": [
        "This metric takes list of labels for the predictions and references:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YOfoAVULsJBN",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "93aa3610-aa5b-4318-c3d2-02171e9596e9"
      },
      "source": [
        "example = ner_train[4]\n",
        "labels = example['tags']\n",
        "metric.compute(predictions=[labels], references=[labels])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'DI': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},\n",
              " 'Drugform': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},\n",
              " 'overall_accuracy': 1.0,\n",
              " 'overall_f1': 1.0,\n",
              " 'overall_precision': 1.0,\n",
              " 'overall_recall': 1.0}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7sZOdRlRIrJd"
      },
      "source": [
        "So we will need to do a bit of post-processing on our predictions:\n",
        "- select the predicted index (with the maximum logit) for each token\n",
        "- convert it to its string label\n",
        "- ignore everywhere we set a label of -100\n",
        "\n",
        "The following function does all this post-processing on the result of `Trainer.evaluate` (which is a namedtuple containing predictions and labels) before applying the metric:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UmvbnJ9JIrJd"
      },
      "source": [
        "import numpy as np\n",
        "\n",
        "def compute_metrics(p):\n",
        "    predictions, labels = p\n",
        "    predictions = np.argmax(predictions, axis=2)\n",
        "\n",
        "    # Remove ignored index (special tokens)\n",
        "    true_predictions = [\n",
        "        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
        "        for prediction, label in zip(predictions, labels)\n",
        "    ]\n",
        "    true_labels = [\n",
        "        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
        "        for prediction, label in zip(predictions, labels)\n",
        "    ]\n",
        "\n",
        "    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)\n",
        "    return {\n",
        "        \"precision\": results[\"overall_precision\"],\n",
        "        \"recall\": results[\"overall_recall\"],\n",
        "        \"f1\": results[\"overall_f1\"],\n",
        "        \"accuracy\": results[\"overall_accuracy\"],\n",
        "    }"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rXuFTAzDIrJe"
      },
      "source": [
        "Note that we drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.\n",
        "\n",
        "Then we just need to pass all of this along with our datasets to the `Trainer`:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "imY1oC3SIrJf"
      },
      "source": [
        "trainer = Trainer(\n",
        "    model,\n",
        "    args,\n",
        "    train_dataset=tokenized_datasets[\"train\"],\n",
        "    eval_dataset=tokenized_datasets[\"test\"],\n",
        "    data_collator=data_collator,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics=compute_metrics\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 238
        },
        "id": "ZP1dvvNlXg9Y",
        "outputId": "5615b5dc-3794-48d4-af11-12d9a6a0eac6"
      },
      "source": [
        "trainer.evaluate()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags.\n",
            "***** Running Evaluation *****\n",
            "  Num examples = 962\n",
            "  Batch size = 16\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='61' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [61/61 00:00]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'eval_accuracy': 0.07571226846083562,\n",
              " 'eval_f1': 0.03137110167927662,\n",
              " 'eval_loss': 2.604278326034546,\n",
              " 'eval_precision': 0.018480269594521145,\n",
              " 'eval_recall': 0.10372178157413056,\n",
              " 'eval_runtime': 1.5067,\n",
              " 'eval_samples_per_second': 638.492,\n",
              " 'eval_steps_per_second': 40.486}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 35
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "a-xw5JvKzyrf"
      },
      "source": [
        "В начале обучения заморозим все параметры в модели, кроме последнего слоя, и посмотрим, насколько хорошо она обучится."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lzwwl_YQWKxq"
      },
      "source": [
        "for param in model.bert.parameters():\n",
        "    param.requires_grad = False"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EhRisAHxWZRG",
        "outputId": "95232006-ea19-46f8-a893-4fdfa44805f1"
      },
      "source": [
        "for name, param in model.named_parameters():\n",
        "    if param.requires_grad:\n",
        "        print(name)\n",
        "        print(param)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "classifier.weight\n",
            "Parameter containing:\n",
            "tensor([[-5.3295e-02,  8.1591e-05, -1.4091e-02,  ...,  9.4435e-03,\n",
            "          2.6371e-02, -2.7459e-02],\n",
            "        [-1.4154e-02,  1.8980e-02, -6.4149e-03,  ..., -3.0063e-02,\n",
            "         -8.0335e-03, -1.3474e-02],\n",
            "        [ 3.9226e-03, -1.7339e-03, -2.4043e-03,  ...,  1.1911e-02,\n",
            "         -6.8623e-03, -3.6764e-02],\n",
            "        ...,\n",
            "        [ 2.9699e-02, -2.5830e-02,  2.9956e-03,  ...,  2.0724e-02,\n",
            "          2.6304e-02, -1.3127e-04],\n",
            "        [-2.8258e-02,  1.9521e-03, -1.2629e-02,  ..., -2.4292e-02,\n",
            "         -1.9133e-02,  3.5226e-02],\n",
            "        [ 4.8563e-03, -3.9019e-02,  2.2573e-02,  ...,  2.3094e-02,\n",
            "         -5.4334e-03, -3.1281e-02]], device='cuda:0', requires_grad=True)\n",
            "classifier.bias\n",
            "Parameter containing:\n",
            "tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0',\n",
            "       requires_grad=True)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CdzABDVcIrJg"
      },
      "source": [
        "We can now finetune our model by just calling the `train` method:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nsuTXCjMeYHE"
      },
      "source": [
        "import logging\n",
        "from transformers.trainer import logger as noisy_logger\n",
        "noisy_logger.setLevel(logging.WARNING)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yGz3c_A_sJBO",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 429
        },
        "outputId": "187b4d00-27fc-464b-da73-9fd47e2dc862"
      },
      "source": [
        "trainer.train()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='2410' max='2410' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [2410/2410 00:31, Epoch 10/10]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "      <th>Precision</th>\n",
              "      <th>Recall</th>\n",
              "      <th>F1</th>\n",
              "      <th>Accuracy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>No log</td>\n",
              "      <td>2.034866</td>\n",
              "      <td>0.032157</td>\n",
              "      <td>0.059487</td>\n",
              "      <td>0.041747</td>\n",
              "      <td>0.630991</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>No log</td>\n",
              "      <td>1.594469</td>\n",
              "      <td>0.042105</td>\n",
              "      <td>0.004881</td>\n",
              "      <td>0.008748</td>\n",
              "      <td>0.815724</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>2.043100</td>\n",
              "      <td>1.282439</td>\n",
              "      <td>0.052632</td>\n",
              "      <td>0.000305</td>\n",
              "      <td>0.000607</td>\n",
              "      <td>0.826314</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>4</td>\n",
              "      <td>2.043100</td>\n",
              "      <td>1.079169</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826854</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>5</td>\n",
              "      <td>1.267200</td>\n",
              "      <td>0.954540</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>6</td>\n",
              "      <td>1.267200</td>\n",
              "      <td>0.880644</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>7</td>\n",
              "      <td>0.932500</td>\n",
              "      <td>0.837882</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>8</td>\n",
              "      <td>0.932500</td>\n",
              "      <td>0.813664</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>9</td>\n",
              "      <td>0.808700</td>\n",
              "      <td>0.801121</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>10</td>\n",
              "      <td>0.808700</td>\n",
              "      <td>0.797258</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.826896</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='122' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [61/61 00:53]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "TrainOutput(global_step=2410, training_loss=1.181212188594074, metrics={'train_runtime': 31.5523, 'train_samples_per_second': 1219.246, 'train_steps_per_second': 76.381, 'total_flos': 35752217175750.0, 'train_loss': 1.181212188594074, 'epoch': 10.0})"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 39
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "H14j1R3cbDPO"
      },
      "source": [
        "Модель недообучилась: похоже, что нужно обучить больше слоёв. Разморозим их все (но, воможно, более правильно было бы разморозить лишь несколько верхних), и поучимся ещё эпох 20."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "65soVR9sbE77"
      },
      "source": [
        "# разморозка\n",
        "for param in model.parameters():\n",
        "    param.requires_grad = True"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "u-3sfj5ocug0",
        "outputId": "32b7bd21-4999-4f15-e1b8-cfd60dcdf4f8"
      },
      "source": [
        "args = TrainingArguments(\n",
        "    \"ner\",\n",
        "    evaluation_strategy = \"epoch\",\n",
        "    learning_rate=1e-5,\n",
        "    per_device_train_batch_size=batch_size,\n",
        "    per_device_eval_batch_size=batch_size,\n",
        "    num_train_epochs=20,\n",
        "    weight_decay=0.01,\n",
        "    save_strategy='no',\n",
        "    report_to='none',\n",
        ")"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "PyTorch: setting up devices\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wGW0r33pdLOy"
      },
      "source": [
        "trainer = Trainer(\n",
        "    model,\n",
        "    args,\n",
        "    train_dataset=tokenized_datasets[\"train\"],\n",
        "    eval_dataset=tokenized_datasets[\"test\"],\n",
        "    data_collator=data_collator,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics=compute_metrics\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 706
        },
        "id": "C5nZBs-BbFRq",
        "outputId": "61effef3-dff8-4296-f3d2-00dedd88011f"
      },
      "source": [
        "trainer.train()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='4820' max='4820' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [4820/4820 01:51, Epoch 20/20]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "      <th>Precision</th>\n",
              "      <th>Recall</th>\n",
              "      <th>F1</th>\n",
              "      <th>Accuracy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.584139</td>\n",
              "      <td>0.703590</td>\n",
              "      <td>0.209274</td>\n",
              "      <td>0.322596</td>\n",
              "      <td>0.851981</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.516597</td>\n",
              "      <td>0.603892</td>\n",
              "      <td>0.321843</td>\n",
              "      <td>0.419900</td>\n",
              "      <td>0.863070</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>0.571100</td>\n",
              "      <td>0.474432</td>\n",
              "      <td>0.609095</td>\n",
              "      <td>0.384076</td>\n",
              "      <td>0.471094</td>\n",
              "      <td>0.871252</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>4</td>\n",
              "      <td>0.571100</td>\n",
              "      <td>0.446405</td>\n",
              "      <td>0.624401</td>\n",
              "      <td>0.437157</td>\n",
              "      <td>0.514265</td>\n",
              "      <td>0.878769</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>5</td>\n",
              "      <td>0.446100</td>\n",
              "      <td>0.423940</td>\n",
              "      <td>0.619102</td>\n",
              "      <td>0.496339</td>\n",
              "      <td>0.550965</td>\n",
              "      <td>0.885414</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>6</td>\n",
              "      <td>0.446100</td>\n",
              "      <td>0.405271</td>\n",
              "      <td>0.620240</td>\n",
              "      <td>0.536608</td>\n",
              "      <td>0.575401</td>\n",
              "      <td>0.889733</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>7</td>\n",
              "      <td>0.387700</td>\n",
              "      <td>0.391646</td>\n",
              "      <td>0.630487</td>\n",
              "      <td>0.556437</td>\n",
              "      <td>0.591152</td>\n",
              "      <td>0.893222</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>8</td>\n",
              "      <td>0.387700</td>\n",
              "      <td>0.381404</td>\n",
              "      <td>0.606738</td>\n",
              "      <td>0.587858</td>\n",
              "      <td>0.597149</td>\n",
              "      <td>0.894468</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>9</td>\n",
              "      <td>0.349000</td>\n",
              "      <td>0.374620</td>\n",
              "      <td>0.603774</td>\n",
              "      <td>0.615009</td>\n",
              "      <td>0.609340</td>\n",
              "      <td>0.895922</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>10</td>\n",
              "      <td>0.349000</td>\n",
              "      <td>0.364899</td>\n",
              "      <td>0.621263</td>\n",
              "      <td>0.615009</td>\n",
              "      <td>0.618120</td>\n",
              "      <td>0.898787</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>11</td>\n",
              "      <td>0.320000</td>\n",
              "      <td>0.356865</td>\n",
              "      <td>0.638978</td>\n",
              "      <td>0.610128</td>\n",
              "      <td>0.624220</td>\n",
              "      <td>0.900573</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>12</td>\n",
              "      <td>0.320000</td>\n",
              "      <td>0.353724</td>\n",
              "      <td>0.621075</td>\n",
              "      <td>0.627517</td>\n",
              "      <td>0.624279</td>\n",
              "      <td>0.900365</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>13</td>\n",
              "      <td>0.304200</td>\n",
              "      <td>0.351088</td>\n",
              "      <td>0.612875</td>\n",
              "      <td>0.641855</td>\n",
              "      <td>0.627030</td>\n",
              "      <td>0.900947</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>14</td>\n",
              "      <td>0.304200</td>\n",
              "      <td>0.344875</td>\n",
              "      <td>0.635614</td>\n",
              "      <td>0.634838</td>\n",
              "      <td>0.635226</td>\n",
              "      <td>0.903273</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>15</td>\n",
              "      <td>0.290300</td>\n",
              "      <td>0.343057</td>\n",
              "      <td>0.632229</td>\n",
              "      <td>0.640329</td>\n",
              "      <td>0.636253</td>\n",
              "      <td>0.903107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>16</td>\n",
              "      <td>0.290300</td>\n",
              "      <td>0.340833</td>\n",
              "      <td>0.637323</td>\n",
              "      <td>0.644905</td>\n",
              "      <td>0.641092</td>\n",
              "      <td>0.903896</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>17</td>\n",
              "      <td>0.278900</td>\n",
              "      <td>0.338196</td>\n",
              "      <td>0.647566</td>\n",
              "      <td>0.641245</td>\n",
              "      <td>0.644390</td>\n",
              "      <td>0.904892</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>18</td>\n",
              "      <td>0.278900</td>\n",
              "      <td>0.337507</td>\n",
              "      <td>0.638947</td>\n",
              "      <td>0.651617</td>\n",
              "      <td>0.645220</td>\n",
              "      <td>0.905100</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>19</td>\n",
              "      <td>0.267100</td>\n",
              "      <td>0.336935</td>\n",
              "      <td>0.637556</td>\n",
              "      <td>0.652532</td>\n",
              "      <td>0.644957</td>\n",
              "      <td>0.904975</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>20</td>\n",
              "      <td>0.267100</td>\n",
              "      <td>0.336756</td>\n",
              "      <td>0.637094</td>\n",
              "      <td>0.652837</td>\n",
              "      <td>0.644870</td>\n",
              "      <td>0.905017</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "TrainOutput(global_step=4820, training_loss=0.35207317083208395, metrics={'train_runtime': 111.6227, 'train_samples_per_second': 689.286, 'train_steps_per_second': 43.181, 'total_flos': 71548216106580.0, 'train_loss': 0.35207317083208395, 'epoch': 20.0})"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 43
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CKASz-2vIrJi"
      },
      "source": [
        "The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UOUcBkX8IrJi",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 188
        },
        "outputId": "c5772a91-7302-4f14-da69-c99eb281dcd1"
      },
      "source": [
        "trainer.evaluate()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='61' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [61/61 00:00]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'epoch': 20.0,\n",
              " 'eval_accuracy': 0.9050170279923582,\n",
              " 'eval_f1': 0.6448696700316409,\n",
              " 'eval_loss': 0.3367559015750885,\n",
              " 'eval_precision': 0.6370943733253944,\n",
              " 'eval_recall': 0.652837095790116,\n",
              " 'eval_runtime': 1.1185,\n",
              " 'eval_samples_per_second': 860.049,\n",
              " 'eval_steps_per_second': 54.535}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 44
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BaMhVjZ-sJBO"
      },
      "source": [
        "To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wm8MsZ3tsJBO",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 509
        },
        "outputId": "6fdbd471-7987-47c3-cbd0-83a82bd45ce3"
      },
      "source": [
        "predictions, labels, _ = trainer.predict(tokenized_datasets[\"test\"])\n",
        "predictions = np.argmax(predictions, axis=2)\n",
        "\n",
        "# Remove ignored index (special tokens)\n",
        "true_predictions = [\n",
        "    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
        "    for prediction, label in zip(predictions, labels)\n",
        "]\n",
        "true_labels = [\n",
        "    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
        "    for prediction, label in zip(predictions, labels)\n",
        "]\n",
        "\n",
        "results = metric.compute(predictions=true_predictions, references=true_labels)\n",
        "results"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='122' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [61/61 00:08]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
            "  _warn_prf(average, modifier, msg_start, len(result))\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'ADR': {'f1': 0.30279898218829515,\n",
              "  'number': 446,\n",
              "  'precision': 0.35,\n",
              "  'recall': 0.26681614349775784},\n",
              " 'DI': {'f1': 0.493963782696177,\n",
              "  'number': 821,\n",
              "  'precision': 0.4207369323050557,\n",
              "  'recall': 0.5980511571254568},\n",
              " 'Drugclass': {'f1': 0.7868852459016393,\n",
              "  'number': 336,\n",
              "  'precision': 0.7880597014925373,\n",
              "  'recall': 0.7857142857142857},\n",
              " 'Drugform': {'f1': 0.7922794117647058,\n",
              "  'number': 565,\n",
              "  'precision': 0.8240917782026769,\n",
              "  'recall': 0.7628318584070797},\n",
              " 'Drugname': {'f1': 0.8734309623430963,\n",
              "  'number': 918,\n",
              "  'precision': 0.8400402414486922,\n",
              "  'recall': 0.9095860566448801},\n",
              " 'Finding': {'f1': 0.0, 'number': 192, 'precision': 0.0, 'recall': 0.0},\n",
              " 'overall_accuracy': 0.9050170279923582,\n",
              " 'overall_f1': 0.6448696700316409,\n",
              " 'overall_precision': 0.6370943733253944,\n",
              " 'overall_recall': 0.652837095790116}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 45
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nI18Xeda7X8a"
      },
      "source": [
        "from sklearn.metrics import confusion_matrix\n",
        "import pandas as pd"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 435
        },
        "id": "Yz9BkfrO7bg6",
        "outputId": "d6ce002d-0803-4320-8711-ec33bdc9c40d"
      },
      "source": [
        "cm = pd.DataFrame(\n",
        "    confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),\n",
        "    index=label_list,\n",
        "    columns=label_list\n",
        ")\n",
        "cm"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>O</th>\n",
              "      <th>B-ADR</th>\n",
              "      <th>B-DI</th>\n",
              "      <th>B-Drugclass</th>\n",
              "      <th>B-Drugform</th>\n",
              "      <th>B-Drugname</th>\n",
              "      <th>B-Finding</th>\n",
              "      <th>I-ADR</th>\n",
              "      <th>I-DI</th>\n",
              "      <th>I-Drugclass</th>\n",
              "      <th>I-Drugform</th>\n",
              "      <th>I-Drugname</th>\n",
              "      <th>I-Finding</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>O</th>\n",
              "      <td>19494</td>\n",
              "      <td>29</td>\n",
              "      <td>175</td>\n",
              "      <td>35</td>\n",
              "      <td>60</td>\n",
              "      <td>71</td>\n",
              "      <td>0</td>\n",
              "      <td>20</td>\n",
              "      <td>26</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-ADR</th>\n",
              "      <td>159</td>\n",
              "      <td>135</td>\n",
              "      <td>133</td>\n",
              "      <td>8</td>\n",
              "      <td>2</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>4</td>\n",
              "      <td>5</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-DI</th>\n",
              "      <td>242</td>\n",
              "      <td>21</td>\n",
              "      <td>525</td>\n",
              "      <td>0</td>\n",
              "      <td>17</td>\n",
              "      <td>10</td>\n",
              "      <td>0</td>\n",
              "      <td>3</td>\n",
              "      <td>3</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-Drugclass</th>\n",
              "      <td>50</td>\n",
              "      <td>1</td>\n",
              "      <td>17</td>\n",
              "      <td>264</td>\n",
              "      <td>0</td>\n",
              "      <td>4</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-Drugform</th>\n",
              "      <td>98</td>\n",
              "      <td>4</td>\n",
              "      <td>11</td>\n",
              "      <td>1</td>\n",
              "      <td>432</td>\n",
              "      <td>17</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-Drugname</th>\n",
              "      <td>44</td>\n",
              "      <td>1</td>\n",
              "      <td>16</td>\n",
              "      <td>1</td>\n",
              "      <td>8</td>\n",
              "      <td>848</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>B-Finding</th>\n",
              "      <td>56</td>\n",
              "      <td>32</td>\n",
              "      <td>87</td>\n",
              "      <td>5</td>\n",
              "      <td>3</td>\n",
              "      <td>3</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>5</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-ADR</th>\n",
              "      <td>180</td>\n",
              "      <td>51</td>\n",
              "      <td>40</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>47</td>\n",
              "      <td>30</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-DI</th>\n",
              "      <td>236</td>\n",
              "      <td>17</td>\n",
              "      <td>102</td>\n",
              "      <td>10</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>11</td>\n",
              "      <td>46</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-Drugclass</th>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>4</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-Drugform</th>\n",
              "      <td>3</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-Drugname</th>\n",
              "      <td>19</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>39</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>I-Finding</th>\n",
              "      <td>25</td>\n",
              "      <td>7</td>\n",
              "      <td>6</td>\n",
              "      <td>7</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>2</td>\n",
              "      <td>5</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                 O  B-ADR  B-DI  ...  I-Drugform  I-Drugname  I-Finding\n",
              "O            19494     29   175  ...           0           0          0\n",
              "B-ADR          159    135   133  ...           0           0          0\n",
              "B-DI           242     21   525  ...           0           0          0\n",
              "B-Drugclass     50      1    17  ...           0           0          0\n",
              "B-Drugform      98      4    11  ...           0           0          0\n",
              "B-Drugname      44      1    16  ...           0           0          0\n",
              "B-Finding       56     32    87  ...           0           0          0\n",
              "I-ADR          180     51    40  ...           0           0          0\n",
              "I-DI           236     17   102  ...           0           0          0\n",
              "I-Drugclass      0      0     0  ...           0           0          0\n",
              "I-Drugform       3      0     0  ...           0           0          0\n",
              "I-Drugname      19      0     0  ...           0           0          0\n",
              "I-Finding       25      7     6  ...           0           0          0\n",
              "\n",
              "[13 rows x 13 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 47
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cA0jWZwjVbI7",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "4161d7fe-c5e3-4f56-b9d4-52830e14da06"
      },
      "source": [
        "model.save_pretrained('ner_bert.bin')\n",
        "tokenizer.save_pretrained('ner_bert.bin')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Configuration saved in ner_bert.bin/config.json\n",
            "Model weights saved in ner_bert.bin/pytorch_model.bin\n",
            "tokenizer config file saved in ner_bert.bin/tokenizer_config.json\n",
            "Special tokens file saved in ner_bert.bin/special_tokens_map.json\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('ner_bert.bin/tokenizer_config.json',\n",
              " 'ner_bert.bin/special_tokens_map.json',\n",
              " 'ner_bert.bin/vocab.txt',\n",
              " 'ner_bert.bin/added_tokens.json',\n",
              " 'ner_bert.bin/tokenizer.json')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 48
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C5yv9hItsJBP"
      },
      "source": [
        "# Применение модели"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "p0JHjRKmuv_m"
      },
      "source": [
        "import torch"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "Kp59uTtXZKT4",
        "outputId": "6ed0bcf0-7de7-4b93-936b-0562ef0505b4"
      },
      "source": [
        "text = ' '.join(ner_train[8]['tokens'])\n",
        "text = ' '.join(ner_test[4]['tokens'])\n",
        "text"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 50
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6h2hiUylZVmF"
      },
      "source": [
        "import torch"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Yt8EXbDuuB1U",
        "outputId": "bf2699a1-c42f-42bb-b00b-8adb7aca1edd"
      },
      "source": [
        "tokens = tokenizer(text, return_tensors='pt')\n",
        "tokens = {k: v.to(model.device) for k, v in tokens.items()}\n",
        "\n",
        "with torch.no_grad():\n",
        "    pred = model(**tokens)\n",
        "pred.logits.shape"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([1, 29, 13])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 52
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2GQPlbnyuu6H",
        "outputId": "e43272b6-3e22-44bb-ec7b-15a2acf570d3"
      },
      "source": [
        "indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()\n",
        "token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])\n",
        "for t, idx in zip(token_text, indices):\n",
        "    print(f'{t:15s} {label_list[idx]:10s}')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[CLS]           O         \n",
            "О               O         \n",
            "##хо            O         \n",
            "##тно           O         \n",
            "при             O         \n",
            "##мен           O         \n",
            "##я             O         \n",
            "##ю             O         \n",
            "его             O         \n",
            "при             O         \n",
            "борьбе          O         \n",
            "с               O         \n",
            "нас             B-DI      \n",
            "##мор           B-DI      \n",
            "##ком           B-DI      \n",
            ",               O         \n",
            "что             O         \n",
            "в               O         \n",
            "м               O         \n",
            "##ое            O         \n",
            "##м             O         \n",
            "случае          O         \n",
            "я               O         \n",
            "##вление        O         \n",
            "очень           O         \n",
            "часто           O         \n",
            "##е             O         \n",
            ".               O         \n",
            "[SEP]           O         \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tBSq9enuwJ_V"
      },
      "source": [
        "Более простое применение модели: пайплайн от huggingface"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lnrAoy6b8swA"
      },
      "source": [
        "from transformers import pipeline"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uowfISMu8v1k"
      },
      "source": [
        "pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1WjXMXCv9Nde",
        "outputId": "e2b4f6d6-6153-4bbd-915d-a3bc4d148895"
      },
      "source": [
        "print(text)\n",
        "print(pipe(text))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .\n",
            "[{'entity_group': 'DI', 'score': 0.73669535, 'word': 'насморком', 'start': 33, 'end': 42}]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "panjTvbH9PJL"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}