Skip to content

Instantly share code, notes, and snippets.

@avidale
Created August 24, 2021 10:03
Show Gist options
  • Save avidale/cacf235aebeaaf4c578389e1146c3c57 to your computer and use it in GitHub Desktop.
Save avidale/cacf235aebeaaf4c578389e1146c3c57 to your computer and use it in GitHub Desktop.
Bert-NER-ru
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Bert-NER-ru",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"38effe6811e0445ea6a06fbf62322cb2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_9c3e627f3f214708a0eada8855e345e1",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_4b4eaf184a9a482798ac08ff5febafb9",
"IPY_MODEL_c3566754043543218423794f629214d6",
"IPY_MODEL_b92ff60477c043e0b2bdaea88a70295b"
]
}
},
"9c3e627f3f214708a0eada8855e345e1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"4b4eaf184a9a482798ac08ff5febafb9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5e07fb58967a4d03931682646417f324",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c8847e2fd0654eabb62848c927685ba4"
}
},
"c3566754043543218423794f629214d6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_bb59a31387e84bc18937ca0aafb36230",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 341,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 341,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7acb711b2f994c0c8c292f7735ccfd15"
}
},
"b92ff60477c043e0b2bdaea88a70295b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_93c12e9ea0294f70b4dca310239b55e4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 341/341 [00:00<00:00, 6.59kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_44595961fd42457496803732f50e813b"
}
},
"5e07fb58967a4d03931682646417f324": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c8847e2fd0654eabb62848c927685ba4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"bb59a31387e84bc18937ca0aafb36230": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"7acb711b2f994c0c8c292f7735ccfd15": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"93c12e9ea0294f70b4dca310239b55e4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"44595961fd42457496803732f50e813b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"97db5fbd0b134002a15f1e89b2e2a871": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_4ac3dbe72f9b4dd2b27d91630736fbcc",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_937d10a0f080492ab64efbb53f6ebe83",
"IPY_MODEL_3dab395a7da545838fb6c54fc19420cc",
"IPY_MODEL_712ed73184ec4551a6891862b3295b22"
]
}
},
"4ac3dbe72f9b4dd2b27d91630736fbcc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"937d10a0f080492ab64efbb53f6ebe83": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5281ff75b7144218a6218c65d8789f48",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_bedec2ca0adc4239b35f94a23a9b3cdd"
}
},
"3dab395a7da545838fb6c54fc19420cc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_8263aecd8ead4ceabf6112e536ac6dbd",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 632,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 632,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e7f42df803cc4ed6ae43ead43e283ed8"
}
},
"712ed73184ec4551a6891862b3295b22": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_57770139538d40c38c68112f622993f3",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 632/632 [00:00<00:00, 20.9kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c3caeafc16f44882938d7c1dcb56cb18"
}
},
"5281ff75b7144218a6218c65d8789f48": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"bedec2ca0adc4239b35f94a23a9b3cdd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8263aecd8ead4ceabf6112e536ac6dbd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"e7f42df803cc4ed6ae43ead43e283ed8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"57770139538d40c38c68112f622993f3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c3caeafc16f44882938d7c1dcb56cb18": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0389bbf4dde74c4db0117a6f66c8808f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_bb94580fba5140078f7a4ac289308e6f",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_ff1d724506ef4ab1a73c4a6635a95cdb",
"IPY_MODEL_6568a4b7ebba4b66ae324cfb1e4c6c56",
"IPY_MODEL_37f3d4726084402ba1ff26773197b415"
]
}
},
"bb94580fba5140078f7a4ac289308e6f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ff1d724506ef4ab1a73c4a6635a95cdb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_6f02d9d6f80249e4949123b465d56584",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_f5e2e39b8c344f81b40f43c7645b6fae"
}
},
"6568a4b7ebba4b66ae324cfb1e4c6c56": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_2f8e9faf6afe4b278a4889a13e29fd68",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 241082,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 241082,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_ff12a26f63e643fbb301dffe15f375cf"
}
},
"37f3d4726084402ba1ff26773197b415": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_2fd0ded25025401d917bc2624b23a5ce",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 241k/241k [00:00<00:00, 3.11MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fd461fa9ee1249dbb49e10bf7201f9d9"
}
},
"6f02d9d6f80249e4949123b465d56584": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"f5e2e39b8c344f81b40f43c7645b6fae": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2f8e9faf6afe4b278a4889a13e29fd68": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"ff12a26f63e643fbb301dffe15f375cf": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2fd0ded25025401d917bc2624b23a5ce": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"fd461fa9ee1249dbb49e10bf7201f9d9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"91e9fdb905864efa8d42ee7cb3680e08": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_d8aa99a5f7cb42b9b3ca28edb0b0a007",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_0f4c5d11ce814b83bb85389b3c5a4c5f",
"IPY_MODEL_62ae82032a8a403b989ee8e0ab06f58a",
"IPY_MODEL_89ccf6c81454461c9a94ee0b9820d4a9"
]
}
},
"d8aa99a5f7cb42b9b3ca28edb0b0a007": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0f4c5d11ce814b83bb85389b3c5a4c5f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c76403b23ddd4fd5a644e63e21da8358",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_303a6382c4ff45439a40c47e969306da"
}
},
"62ae82032a8a403b989ee8e0ab06f58a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_24bff1add1804f5ea46b5edaf4ca7428",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 468145,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 468145,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6e2ff1b9214049188ac0587fb33b5352"
}
},
"89ccf6c81454461c9a94ee0b9820d4a9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5f25f5e5ad604408bcb53d0b70f67f20",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 468k/468k [00:00<00:00, 5.63MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_76260400334d4c4f8985e69b4800ae28"
}
},
"c76403b23ddd4fd5a644e63e21da8358": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"303a6382c4ff45439a40c47e969306da": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"24bff1add1804f5ea46b5edaf4ca7428": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"6e2ff1b9214049188ac0587fb33b5352": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5f25f5e5ad604408bcb53d0b70f67f20": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"76260400334d4c4f8985e69b4800ae28": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"21fdf7eb6fb94da0bc9639b3f4ea7f00": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_20b1444148dc40f89b107e30dbcf6c0b",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_36dc0d7a19c04a47a06618e187ee894a",
"IPY_MODEL_791e009541a64d749330e6123ca7d87f",
"IPY_MODEL_06c63accbd2a4903b762ed21545bfbbe"
]
}
},
"20b1444148dc40f89b107e30dbcf6c0b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"36dc0d7a19c04a47a06618e187ee894a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_07fee35962004c8996c8acef923292eb",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b3332e5f66ad4c6c830f28bc290cd4bd"
}
},
"791e009541a64d749330e6123ca7d87f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_b831ba8c276b4a1bb0ef7ae16a7a8fc9",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 112,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 112,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7eff20962dfe422b9523c4e74f5372aa"
}
},
"06c63accbd2a4903b762ed21545bfbbe": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_663c10e13a0e47f7b115ad50bd5b3965",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 112/112 [00:00<00:00, 3.03kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_997504803ac5445588c07cf97049d14a"
}
},
"07fee35962004c8996c8acef923292eb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b3332e5f66ad4c6c830f28bc290cd4bd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b831ba8c276b4a1bb0ef7ae16a7a8fc9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"7eff20962dfe422b9523c4e74f5372aa": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"663c10e13a0e47f7b115ad50bd5b3965": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"997504803ac5445588c07cf97049d14a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e16827a6f03a4b92889daf18d9917126": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_7b3185264cfe469683ad9cc81b0a8484",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_20ea5f6227b041969b1ce0d686a39121",
"IPY_MODEL_78818c7330fd489c8820d845afab2fca",
"IPY_MODEL_19c53fdf63a0408e8ad85e56a94d7dcd"
]
}
},
"7b3185264cfe469683ad9cc81b0a8484": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"20ea5f6227b041969b1ce0d686a39121": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_43d9892c92f343369d714477c706d45a",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_91f00c3eed1b41d288a53cf829f88555"
}
},
"78818c7330fd489c8820d845afab2fca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_f2f2a65d8c5d4627855526eccf8c68d7",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 4,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 4,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_2c4a9988fa90474ba9aa1f48bf03704a"
}
},
"19c53fdf63a0408e8ad85e56a94d7dcd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_ec76c271eccd45c7b8d28a15274b1d50",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 4/4 [00:00<00:00, 6.93ba/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_eefb8e1e66ff486e92c0ce618c12be66"
}
},
"43d9892c92f343369d714477c706d45a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"91f00c3eed1b41d288a53cf829f88555": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f2f2a65d8c5d4627855526eccf8c68d7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"2c4a9988fa90474ba9aa1f48bf03704a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ec76c271eccd45c7b8d28a15274b1d50": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"eefb8e1e66ff486e92c0ce618c12be66": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b27b3845581c4dcba258672ecde20982": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_16091398feea4bfb8e5c07a679934a3e",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_b6f776fb81874175a00f9d4569edf89c",
"IPY_MODEL_9ae961550c684fbda20bbec6043ca80e",
"IPY_MODEL_54e1fd908fed41c981e8bb39068da20c"
]
}
},
"16091398feea4bfb8e5c07a679934a3e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b6f776fb81874175a00f9d4569edf89c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c46c29c17e5d41128bb8bc401c5c4c8c",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d0470494b6be4bf0af4fc7007e285149"
}
},
"9ae961550c684fbda20bbec6043ca80e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_9c406cfcfcf9424181939d2f6009090a",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d2e8a291b9c54a45ba9f812ec6d19fcc"
}
},
"54e1fd908fed41c981e8bb39068da20c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_6fb72f595fb14608ae7e6a20c2e410a9",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1/1 [00:00<00:00, 5.44ba/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e174a5cbd06441329ccd8cb547b44503"
}
},
"c46c29c17e5d41128bb8bc401c5c4c8c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d0470494b6be4bf0af4fc7007e285149": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9c406cfcfcf9424181939d2f6009090a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"d2e8a291b9c54a45ba9f812ec6d19fcc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6fb72f595fb14608ae7e6a20c2e410a9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"e174a5cbd06441329ccd8cb547b44503": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"178b2e70a03141c3a8d14c03b1024d34": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_a2a75ec023b64fb0813a0d3b9da549a3",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_6446a516afdf4406a5a8c876aa2a0179",
"IPY_MODEL_787dde6c71aa4303a3f1ae908bdf3288",
"IPY_MODEL_04e795570e2245e2844e7acfd36611e4"
]
}
},
"a2a75ec023b64fb0813a0d3b9da549a3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6446a516afdf4406a5a8c876aa2a0179": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_a0441eab19d441d7ae60ea926a1ddabe",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_00a1bad590e34fc09067414ed4bae1d9"
}
},
"787dde6c71aa4303a3f1ae908bdf3288": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_5b3223766ec44a0781e88c6f0d276c12",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 47679974,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 47679974,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_085cbbc409874d57b38930b6b05ecfd9"
}
},
"04e795570e2245e2844e7acfd36611e4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_1a255ac062e94624a2ecfb4f58889d74",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 47.7M/47.7M [00:01<00:00, 47.3MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_1d51207792fd4afb849e2ae72ddd68ce"
}
},
"a0441eab19d441d7ae60ea926a1ddabe": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"00a1bad590e34fc09067414ed4bae1d9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5b3223766ec44a0781e88c6f0d276c12": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"085cbbc409874d57b38930b6b05ecfd9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"1a255ac062e94624a2ecfb4f58889d74": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"1d51207792fd4afb849e2ae72ddd68ce": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"bba28fea430d436981b0bfab06fb4ee6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_fe16833673ba4dc79001d9b3d28eb6d2",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_513e9fa4a8a04f889d68ac7658818465",
"IPY_MODEL_b38e9ab8c6fa4ea1952a2265dcdcfff5",
"IPY_MODEL_91c93a7307854ac98d9bdf6746286517"
]
}
},
"fe16833673ba4dc79001d9b3d28eb6d2": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"513e9fa4a8a04f889d68ac7658818465": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c291de1c9ba343ff8e140a8c7ef1496a",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: ",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b1e72abd13c84f08b763eae36928fb4e"
}
},
"b38e9ab8c6fa4ea1952a2265dcdcfff5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_251fc9cb4f574aa4881b722cef11324a",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 2482,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 2482,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_71c5d3acf8674d77b38d6fa6b0aba8d5"
}
},
"91c93a7307854ac98d9bdf6746286517": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_94419c78831745928f8e5327f53ef5e0",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 6.34k/? [00:00<00:00, 157kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_31a5bb08896f47309e658dac697b6680"
}
},
"c291de1c9ba343ff8e140a8c7ef1496a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b1e72abd13c84f08b763eae36928fb4e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"251fc9cb4f574aa4881b722cef11324a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"71c5d3acf8674d77b38d6fa6b0aba8d5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"94419c78831745928f8e5327f53ef5e0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"31a5bb08896f47309e658dac697b6680": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/avidale/cacf235aebeaaf4c578389e1146c3c57/bert-ner-ru.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "X4cRE8IbIrIV"
},
"source": [
"Основано на блокноте https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb"
]
},
{
"cell_type": "code",
"metadata": {
"id": "MOsHUjgdIrIW"
},
"source": [
"! pip install datasets transformers seqeval"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "4HL1yaESsJA9"
},
"source": [
"В этом блокноте мы дообучаем модель на задаче классификации отдельных слов, а именно, распознавание именованных сущностей (aka named entity recognition, aka NER). Мы возьмём датасет медицинских сущностей, но в целом пайплайн подходит для любой задачи на выделение сущностей в тексте. \n",
"\n",
"Для скорости мы возьмём маленький BERT для русского языка [rubert-tiny](https://huggingface.co/cointegrated/rubert-tiny); если взять другую, более крупную BERT-подобную модель, качество NER может быть выше, но и время обучения и работы будет дольше \n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4RRkXuteIrIh"
},
"source": [
"This notebook is built to run on any token classification task, with any model checkpoint from the [Model Hub](https://huggingface.co/models) as long as that model has a version with a token classification head and a fast tokenizer (check on [this table](https://huggingface.co/transformers/index.html#bigtable) if this is the case). It might just need some small adjustments if you decide to use a different dataset than the one used here. Depending on you model and the GPU you are using, you might need to adjust the batch size to avoid out-of-memory errors. Set those three parameters, then the rest of the notebook should run smoothly:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "zVvslsfMIrIh"
},
"source": [
"model_checkpoint = \"cointegrated/rubert-tiny\"\n",
"batch_size = 16"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "whPRbBNbIrIl"
},
"source": [
"## Loading the dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "J8mt63rWvkv3"
},
"source": [
"Для обучения мы возьмём [Russian Drug Reaction Corpus](https://github.com/cimm-kzn/RuDReC): размеченный корпус русскоязычных отзывов на лекарства. \n",
"\n",
"Загрузим мы его библиотекой corus, потому что это удобно "
]
},
{
"cell_type": "code",
"metadata": {
"id": "IreSlFmlIrIm"
},
"source": [
"from datasets import load_dataset, load_metric"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "s_AY1ATSIrIq"
},
"source": [
"!wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json\n",
"!pip install corus razdel"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VALH-KBTMfVI",
"outputId": "385f3c15-6aa7-4c9a-bd02-8db5c3a593fb"
},
"source": [
"from corus import load_rudrec\n",
"drugs = list(load_rudrec('rudrec_annotated.json'))\n",
"print(len(drugs))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"4809\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fGBywJmAv2NN"
},
"source": [
"Пример документа:"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ynPlkV5gv4XC",
"outputId": "e3fe1c20-6f9d-4921-d56b-71d810de8143"
},
"source": [
"drugs[0]"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"RuDReCRecord(\n",
" file_name='172744.tsv',\n",
" text='нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... У кого ещё такие побочки, отзовитесь!1 Чем спасались?\\n',\n",
" sentence_id=0,\n",
" entities=[RuDReCEntity(\n",
" entity_id='*[0]_se',\n",
" entity_text='виферона',\n",
" entity_type='Drugform',\n",
" start=122,\n",
" end=130,\n",
" concept_id='C0021735',\n",
" concept_name=nan\n",
" ), RuDReCEntity(\n",
" entity_id='*[1]',\n",
" entity_text='сыпью покрылся',\n",
" entity_type='ADR',\n",
" start=31,\n",
" end=45,\n",
" concept_id='C0015230',\n",
" concept_name=nan\n",
" ), RuDReCEntity(\n",
" entity_id='*[2]',\n",
" entity_text='глаза опухли',\n",
" entity_type='ADR',\n",
" start=47,\n",
" end=59,\n",
" concept_id='C4760994',\n",
" concept_name=nan\n",
" ), RuDReCEntity(\n",
" entity_id='*[3]',\n",
" entity_text='на веках высыпала сыпь',\n",
" entity_type='ADR',\n",
" start=76,\n",
" end=98,\n",
" concept_id='C0015230',\n",
" concept_name=nan\n",
" )]\n",
")"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iSpV6RLEwI5o"
},
"source": [
"Посмотрим, какие сущности есть: лекарства, форма лекарств, класс лекарств, показания к применению, побочки, и прочие болезни/симптомы.\n",
"\n",
"https://arxiv.org/abs/2004.03659\n",
"\n",
"* **DRUGNAME** Mentions of the brand name of a drug or product\n",
"ingredients/active compounds.\n",
"* **DRUGCLASS** Mentions of drug classes such as anti-inflammatory or\n",
"cardiovascular.\n",
"* **DRUGFORM** Mentions of routes of administration such as tablet\n",
"or liquid that describe the physical form in which\n",
"medication will be delivered into patient’s organism.\n",
"* **DI** Any indication/symptom that specifies the reason for\n",
"taking/prescribing the drug.\n",
"* **ADR** Mentions of untoward medical events that occur as a\n",
"consequence of drug intake and are not associated with\n",
"treated symptoms.\n",
"* **FINDING** Any DI or ADR that was not directly experienced by the\n",
"reporting patient or his/her family members, or related to\n",
"medical history/drug label, or any disease entities if the\n",
"annotator is not clear about type"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vo8MIceYNPjx",
"outputId": "23d8f04f-f7d3-4a20-b840-a8b915b8ab47"
},
"source": [
"from collections import Counter, defaultdict\n",
"type2text = defaultdict(Counter)\n",
"ents = Counter()\n",
"for item in drugs:\n",
" for e in item.entities:\n",
" ents[e.entity_type] += 1\n",
" type2text[e.entity_type][e.entity_text] += 1\n",
"\n",
"for k, v in ents.most_common():\n",
" print(k, v)\n",
" print(type2text[k].most_common(3))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"DI 1401\n",
"[('простуды', 64), ('ОРВИ', 47), ('профилактики', 42)]\n",
"Drugname 1043\n",
"[('Виферон', 33), ('Анаферон', 25), ('Циклоферон', 24)]\n",
"Drugform 836\n",
"[('таблетки', 154), ('таблеток', 79), ('свечи', 63)]\n",
"ADR 720\n",
"[('аллергия', 16), ('слабость', 13), ('диарея', 12)]\n",
"Drugclass 330\n",
"[('противовирусный', 21), ('противовирусное', 18), ('противовирусных', 13)]\n",
"Finding 236\n",
"[('аллергии', 12), ('температуры', 6), ('сонливости', 5)]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"id": "0Kszaqs8N0Ig",
"outputId": "34a697ef-c96d-40bb-c6ed-fe6f2c979ab1"
},
"source": [
"drugs[0].text"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... У кого ещё такие побочки, отзовитесь!1 Чем спасались?\\n'"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RzfPtOMoIrIu"
},
"source": [
"Напишем функцию, перекладывающую разметку сущностей на уровень слов. Будем использовать [IOB](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging))-нотацию, чтобы разделять несколько сущностей одного типа, идущих подряд. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "Dg9BL4Z_OcjY"
},
"source": [
"from razdel import tokenize\n",
"\n",
"def extract_labels(item):\n",
" raw_toks = list(tokenize(item.text))\n",
" words = [tok.text for tok in raw_toks]\n",
" word_labels = ['O'] * len(raw_toks)\n",
" char2word = [None] * len(item.text)\n",
" for i, word in enumerate(raw_toks):\n",
" char2word[word.start:word.stop] = [i] * len(word.text)\n",
"\n",
" for e in item.entities:\n",
" e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})\n",
" word_labels[e_words[0]] = 'B-' + e.entity_type\n",
" for idx in e_words[1:]:\n",
" word_labels[idx] = 'I-' + e.entity_type\n",
"\n",
" return {'tokens': words, 'tags': word_labels}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PCGwQAadOVA9",
"outputId": "cb55c0b3-bdc5-4b5c-feae-c560c38554cd"
},
"source": [
"print(extract_labels(drugs[0]))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"{'tokens': ['нам', 'прописали', ',', 'так', 'мой', 'ребенок', 'сыпью', 'покрылся', ',', 'глаза', 'опухли', ',', 'сверху', 'и', 'снизу', 'на', 'веках', 'высыпала', 'сыпь', ',', '(', '8', 'месяцев', 'сыну', ')', 'А', 'от', 'виферона', 'такого', 'не', 'было', '...', 'У', 'кого', 'ещё', 'такие', 'побочки', ',', 'отзовитесь', '!', '1', 'Чем', 'спасались', '?'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'O', 'B-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Drugform', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Chhlmjt8OEgn"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"ner_data = [extract_labels(item) for item in drugs]\n",
"ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "yvApziHbyUyR"
},
"source": [
"Пример данных"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 137
},
"id": "17yA19oFRwMk",
"outputId": "0f5322ef-f6cf-4099-a34c-d298a3a72f72"
},
"source": [
"import pandas as pd\n",
"pd.options.display.max_colwidth = 300\n",
"pd.DataFrame(ner_train).sample(3)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tokens</th>\n",
" <th>tags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3132</th>\n",
" <td>[Но, в, 3, месяца, нам, ставили, гипертонус, ручек, и, ножек, .]</td>\n",
" <td>[O, O, O, O, O, O, B-DI, I-DI, I-DI, I-DI, O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>[У, меня, двое, детей, .]</td>\n",
" <td>[O, O, O, O, O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3101</th>\n",
" <td>[Не, спорю, наслышана, о, широте, его, применения, ,, но, нам, он, не, подошел, абсолютно, !]</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tokens tags\n",
"3132 [Но, в, 3, месяца, нам, ставили, гипертонус, ручек, и, ножек, .] [O, O, O, O, O, O, B-DI, I-DI, I-DI, I-DI, O]\n",
"355 [У, меня, двое, детей, .] [O, O, O, O, O]\n",
"3101 [Не, спорю, наслышана, о, широте, его, применения, ,, но, нам, он, не, подошел, абсолютно, !] [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sE0souTBykq1"
},
"source": [
"Соберём все виды меток в список. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "16SRNc6csJBC",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a06bc57e-5b17-4b45-8009-a7cbfc5e7592"
},
"source": [
"label_list = sorted({label for item in ner_train for label in item['tags']})\n",
"if 'O' in label_list:\n",
" label_list.remove('O')\n",
" label_list = ['O'] + label_list\n",
"label_list"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['O',\n",
" 'B-ADR',\n",
" 'B-DI',\n",
" 'B-Drugclass',\n",
" 'B-Drugform',\n",
" 'B-Drugname',\n",
" 'B-Finding',\n",
" 'I-ADR',\n",
" 'I-DI',\n",
" 'I-Drugclass',\n",
" 'I-Drugform',\n",
" 'I-Drugname',\n",
" 'I-Finding']"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ckjbVWLoyYYf"
},
"source": [
"Сложим наши данные в объект [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), нативный для huggingface."
]
},
{
"cell_type": "code",
"metadata": {
"id": "4E3yy6wmUp-z"
},
"source": [
"from datasets import Dataset, DatasetDict"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3YavIR4eU5ZY",
"outputId": "8b2ae069-88ac-405a-83a8-ff4a73d7d215"
},
"source": [
"ner_data = DatasetDict({\n",
" 'train': Dataset.from_pandas(pd.DataFrame(ner_train)),\n",
" 'test': Dataset.from_pandas(pd.DataFrame(ner_test))\n",
"})\n",
"ner_data"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['tokens', 'tags'],\n",
" num_rows: 3847\n",
" })\n",
" test: Dataset({\n",
" features: ['tokens', 'tags'],\n",
" num_rows: 962\n",
" })\n",
"})"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "n9qywopnIrJH"
},
"source": [
"## Preprocessing the data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YVx71GdAIrJH"
},
"source": [
"Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.\n",
"\n",
"To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n",
"\n",
"- we get a tokenizer that corresponds to the model architecture we want to use,\n",
"- we download the vocabulary used when pretraining this specific checkpoint.\n",
"\n",
"That vocabulary will be cached, so it's not downloaded again the next time we run the cell."
]
},
{
"cell_type": "code",
"metadata": {
"id": "eXNLu_-nIrJI",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 177,
"referenced_widgets": [
"38effe6811e0445ea6a06fbf62322cb2",
"9c3e627f3f214708a0eada8855e345e1",
"4b4eaf184a9a482798ac08ff5febafb9",
"c3566754043543218423794f629214d6",
"b92ff60477c043e0b2bdaea88a70295b",
"5e07fb58967a4d03931682646417f324",
"c8847e2fd0654eabb62848c927685ba4",
"bb59a31387e84bc18937ca0aafb36230",
"7acb711b2f994c0c8c292f7735ccfd15",
"93c12e9ea0294f70b4dca310239b55e4",
"44595961fd42457496803732f50e813b",
"97db5fbd0b134002a15f1e89b2e2a871",
"4ac3dbe72f9b4dd2b27d91630736fbcc",
"937d10a0f080492ab64efbb53f6ebe83",
"3dab395a7da545838fb6c54fc19420cc",
"712ed73184ec4551a6891862b3295b22",
"5281ff75b7144218a6218c65d8789f48",
"bedec2ca0adc4239b35f94a23a9b3cdd",
"8263aecd8ead4ceabf6112e536ac6dbd",
"e7f42df803cc4ed6ae43ead43e283ed8",
"57770139538d40c38c68112f622993f3",
"c3caeafc16f44882938d7c1dcb56cb18",
"0389bbf4dde74c4db0117a6f66c8808f",
"bb94580fba5140078f7a4ac289308e6f",
"ff1d724506ef4ab1a73c4a6635a95cdb",
"6568a4b7ebba4b66ae324cfb1e4c6c56",
"37f3d4726084402ba1ff26773197b415",
"6f02d9d6f80249e4949123b465d56584",
"f5e2e39b8c344f81b40f43c7645b6fae",
"2f8e9faf6afe4b278a4889a13e29fd68",
"ff12a26f63e643fbb301dffe15f375cf",
"2fd0ded25025401d917bc2624b23a5ce",
"fd461fa9ee1249dbb49e10bf7201f9d9",
"91e9fdb905864efa8d42ee7cb3680e08",
"d8aa99a5f7cb42b9b3ca28edb0b0a007",
"0f4c5d11ce814b83bb85389b3c5a4c5f",
"62ae82032a8a403b989ee8e0ab06f58a",
"89ccf6c81454461c9a94ee0b9820d4a9",
"c76403b23ddd4fd5a644e63e21da8358",
"303a6382c4ff45439a40c47e969306da",
"24bff1add1804f5ea46b5edaf4ca7428",
"6e2ff1b9214049188ac0587fb33b5352",
"5f25f5e5ad604408bcb53d0b70f67f20",
"76260400334d4c4f8985e69b4800ae28",
"21fdf7eb6fb94da0bc9639b3f4ea7f00",
"20b1444148dc40f89b107e30dbcf6c0b",
"36dc0d7a19c04a47a06618e187ee894a",
"791e009541a64d749330e6123ca7d87f",
"06c63accbd2a4903b762ed21545bfbbe",
"07fee35962004c8996c8acef923292eb",
"b3332e5f66ad4c6c830f28bc290cd4bd",
"b831ba8c276b4a1bb0ef7ae16a7a8fc9",
"7eff20962dfe422b9523c4e74f5372aa",
"663c10e13a0e47f7b115ad50bd5b3965",
"997504803ac5445588c07cf97049d14a"
]
},
"outputId": "61183128-8efa-44c0-83e5-1bddd1d0fc06"
},
"source": [
"from transformers import AutoTokenizer\n",
" \n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "38effe6811e0445ea6a06fbf62322cb2",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/341 [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97db5fbd0b134002a15f1e89b2e2a871",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/632 [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0389bbf4dde74c4db0117a6f66c8808f",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/241k [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "91e9fdb905864efa8d42ee7cb3680e08",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/468k [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "21fdf7eb6fb94da0bc9639b3f4ea7f00",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/112 [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rowT4iCLIrJK"
},
"source": [
"You can directly call this tokenizer on one sentence:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "a5hBlsrHIrJL",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "685cf29d-15c9-40ac-e802-ae3035b0ca14"
},
"source": [
"tokenizer(\"Hello, this is one sentence!\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [2, 9944, 16, 881, 550, 835, 15503, 5, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 17
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZdStg37nsJBE"
},
"source": [
"Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.\n",
"\n",
"If, as is the case here, your inputs have already been split into words, you should pass the list of words to your tokenizer with the argument `is_split_into_words=True`:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "b_yJ2hgDsJBF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "211c06e6-ca98-46db-eac3-0f4c6809a400"
},
"source": [
"tokenizer([\"Hello\", \",\", \"this\", \"is\", \"one\", \"sentence\", \"split\", \"into\", \"words\", \".\"], is_split_into_words=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [2, 9944, 16, 881, 550, 835, 15503, 7440, 996, 6301, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4JdDuFvbsJBF"
},
"source": [
"Note that transformers are often pretrained with subword tokenizers, meaning that even if your inputs have been split into words already, each of those words could be split again by the tokenizer. Let's look at an example of that:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "OjrkjteOsJBF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a2841a74-4493-4417-cad8-e53d87875542"
},
"source": [
"example = ner_train[5]\n",
"print(example[\"tokens\"])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"['Мы', 'поменяли', 'место', 'жительства', 'и', 'перевели', 'дочь', 'в', 'школу', ',', 'которая', 'находится', 'ближе', 'к', 'дому', '.']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "QU8fkdJMsJBF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9eec8122-4df6-43d8-8b53-52b296e1b640"
},
"source": [
"tokenized_input = tokenizer(example[\"tokens\"], is_split_into_words=True)\n",
"tokens = tokenizer.convert_ids_to_tokens(tokenized_input[\"input_ids\"])\n",
"print(tokens)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"['[CLS]', 'Мы', 'пом', '##ен', '##яли', 'место', 'ж', '##итель', '##ства', 'и', 'пер', '##еве', '##ли', 'дочь', 'в', 'школу', ',', 'которая', 'находится', 'б', '##ли', '##же', 'к', 'дому', '.', '[SEP]']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-7zwzY9wsJBG"
},
"source": [
"Чтобы перейти с уровня слов на уровень subword tokens, нужно ещё раз предобработать тексты."
]
},
{
"cell_type": "code",
"metadata": {
"id": "F39uz6wusJBG",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b0d90db2-f812-4478-bb30-b57235a6a204"
},
"source": [
"len(example[\"tags\"]), len(tokenized_input[\"input_ids\"])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(16, 26)"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pnBazSrTsJBG"
},
"source": [
"Thankfully, the tokenizer returns outputs that have a `word_ids` method which can help us."
]
},
{
"cell_type": "code",
"metadata": {
"id": "Rt7_5_bXsJBH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "90d5e41e-92e4-4199-bcac-f71e2dd3488c"
},
"source": [
"print(tokenized_input.word_ids())"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[None, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 13, 14, 15, None]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rP1PvW2isJBH"
},
"source": [
"As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to `None` and all other tokens to their respective word. This way, we can align the labels with the processed input ids."
]
},
{
"cell_type": "code",
"metadata": {
"id": "NeVhtoANsJBH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "dcc7ae1b-6d60-4abb-c33f-ba9bad0bd859"
},
"source": [
"word_ids = tokenized_input.word_ids()\n",
"aligned_labels = [-100 if i is None else example[\"tags\"][i] for i in word_ids]\n",
"print(len(aligned_labels), len(tokenized_input[\"input_ids\"]))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"26 26\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MM4fgSPDsJBH"
},
"source": [
"Here we set the labels of all special tokens to -100 (the index that is ignored by PyTorch) and the labels of all other tokens to the label of the word they come from. Another strategy is to set the label only on the first token obtained from a given word, and give a label of -100 to the other subtokens from the same word. We propose the two strategies here, just change the flag `label_all_tokens`."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2C0hcmp9IrJQ"
},
"source": [
"We're now ready to write the function that will preprocess our samples. We feed them to the `tokenizer` with the argument `truncation=True` (to truncate texts that are bigger than the maximum size allowed by the model) and `is_split_into_words=True` (as seen above). Then we align the labels with the token ids using the strategy we picked:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vc0BSBLIIrJQ"
},
"source": [
"def tokenize_and_align_labels(examples, label_all_tokens=True):\n",
" tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
"\n",
" labels = []\n",
" for i, label in enumerate(examples['tags']):\n",
" word_ids = tokenized_inputs.word_ids(batch_index=i)\n",
" previous_word_idx = None\n",
" label_ids = []\n",
" for word_idx in word_ids:\n",
" # Special tokens have a word id that is None. We set the label to -100 so they are automatically\n",
" # ignored in the loss function.\n",
" if word_idx is None:\n",
" label_ids.append(-100)\n",
" # We set the label for the first token of each word.\n",
" elif word_idx != previous_word_idx:\n",
" label_ids.append(label[word_idx])\n",
" # For the other tokens in a word, we set the label to either the current label or -100, depending on\n",
" # the label_all_tokens flag.\n",
" else:\n",
" label_ids.append(label[word_idx] if label_all_tokens else -100)\n",
" previous_word_idx = word_idx\n",
"\n",
" label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]\n",
"\n",
" labels.append(label_ids)\n",
"\n",
" tokenized_inputs[\"labels\"] = labels\n",
" return tokenized_inputs"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0lm8ozrJIrJR"
},
"source": [
"This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-b70jh26IrJS",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "7d76f9ad-a433-444f-b118-2bdb9d2013cd"
},
"source": [
"tokenize_and_align_labels(ner_data['train'][22:23])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [[2, 1041, 4033, 3236, 9267, 331, 19173, 19106, 26629, 1887, 22018, 548, 22276, 320, 21538, 16, 705, 13718, 22264, 548, 18397, 14063, 11137, 626, 16296, 24531, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 25
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zS-6iXTkIrJT"
},
"source": [
"To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of the `ner_data` object we created earlier. This will apply the function on all the elements of all the splits in `ner_data`, so all our data splits (the training and test sets used below) will be preprocessed in one single command."
]
},
{
"cell_type": "code",
"metadata": {
"id": "DDtsaJeVIrJT",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 81,
"referenced_widgets": [
"e16827a6f03a4b92889daf18d9917126",
"7b3185264cfe469683ad9cc81b0a8484",
"20ea5f6227b041969b1ce0d686a39121",
"78818c7330fd489c8820d845afab2fca",
"19c53fdf63a0408e8ad85e56a94d7dcd",
"43d9892c92f343369d714477c706d45a",
"91f00c3eed1b41d288a53cf829f88555",
"f2f2a65d8c5d4627855526eccf8c68d7",
"2c4a9988fa90474ba9aa1f48bf03704a",
"ec76c271eccd45c7b8d28a15274b1d50",
"eefb8e1e66ff486e92c0ce618c12be66",
"b27b3845581c4dcba258672ecde20982",
"16091398feea4bfb8e5c07a679934a3e",
"b6f776fb81874175a00f9d4569edf89c",
"9ae961550c684fbda20bbec6043ca80e",
"54e1fd908fed41c981e8bb39068da20c",
"c46c29c17e5d41128bb8bc401c5c4c8c",
"d0470494b6be4bf0af4fc7007e285149",
"9c406cfcfcf9424181939d2f6009090a",
"d2e8a291b9c54a45ba9f812ec6d19fcc",
"6fb72f595fb14608ae7e6a20c2e410a9",
"e174a5cbd06441329ccd8cb547b44503"
]
},
"outputId": "b600760b-c8ce-4744-b0bc-fa777a6e9331"
},
"source": [
"tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e16827a6f03a4b92889daf18d9917126",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/4 [00:00<?, ?ba/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b27b3845581c4dcba258672ecde20982",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "voWiw8C7IrJV"
},
"source": [
"Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to `map` has changed (and thus that the cached data must not be reused). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files; you can pass `load_from_cache_file=False` in the call to `map` to skip the cached files and force the preprocessing to be applied again.\n",
"\n",
"Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "545PP3o8IrJV"
},
"source": [
"## Fine-tuning the model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FBiW8UpKIrJW"
},
"source": [
"Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, we use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which we can get from the features, as seen before):"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4d2zEASrUs1E",
"outputId": "7f0158f6-0aae-4fac-a11c-ad4996558bfc"
},
"source": [
"label_list"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['O',\n",
" 'B-ADR',\n",
" 'B-DI',\n",
" 'B-Drugclass',\n",
" 'B-Drugform',\n",
" 'B-Drugname',\n",
" 'B-Finding',\n",
" 'I-ADR',\n",
" 'I-DI',\n",
" 'I-Drugclass',\n",
" 'I-Drugform',\n",
" 'I-Drugname',\n",
" 'I-Finding']"
]
},
"metadata": {
"tags": []
},
"execution_count": 28
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "TlqNaB8jIrJW",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 152,
"referenced_widgets": [
"178b2e70a03141c3a8d14c03b1024d34",
"a2a75ec023b64fb0813a0d3b9da549a3",
"6446a516afdf4406a5a8c876aa2a0179",
"787dde6c71aa4303a3f1ae908bdf3288",
"04e795570e2245e2844e7acfd36611e4",
"a0441eab19d441d7ae60ea926a1ddabe",
"00a1bad590e34fc09067414ed4bae1d9",
"5b3223766ec44a0781e88c6f0d276c12",
"085cbbc409874d57b38930b6b05ecfd9",
"1a255ac062e94624a2ecfb4f58889d74",
"1d51207792fd4afb849e2ae72ddd68ce"
]
},
"outputId": "4adff280-2147-4280-aff7-25b4b22c22f1"
},
"source": [
"from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer\n",
"\n",
"model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))\n",
"model.config.id2label = dict(enumerate(label_list))\n",
"model.config.label2id = {v: k for k, v in model.config.id2label.items()}"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "178b2e70a03141c3a8d14c03b1024d34",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/47.7M [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CczA5lJlIrJX"
},
"source": [
"The warning is telling us we are throwing away some weights (the `cls.predictions` and `cls.seq_relationship` pretraining heads) and randomly initializing some others (the `classifier` layer). This is absolutely normal in this case, because we are removing the heads used to pretrain the model (including the one for the masked language modeling objective) and replacing them with a new token-classification head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_N8urzhyIrJY"
},
"source": [
"To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Bliy8zgjIrJY"
},
"source": [
"args = TrainingArguments(\n",
" \"ner\",\n",
" evaluation_strategy = \"epoch\",\n",
" learning_rate=2e-5,\n",
" per_device_train_batch_size=batch_size,\n",
" per_device_eval_batch_size=batch_size,\n",
" num_train_epochs=10,\n",
" weight_decay=0.01,\n",
" save_strategy='no',\n",
" report_to='none',\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "km3pGVdTIrJc"
},
"source": [
"Here we set the evaluation to be done at the end of each epoch, tweak the learning rate, use the `batch_size` defined at the top of the notebook and customize the number of epochs for training, as well as the weight decay."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4e6jrE3TsJBM"
},
"source": [
"Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each batch will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "pyiUUwuCsJBM"
},
"source": [
"from transformers import DataCollatorForTokenClassification\n",
"\n",
"data_collator = DataCollatorForTokenClassification(tokenizer)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "YY7DtOOesJBM"
},
"source": [
"The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library."
]
},
{
"cell_type": "code",
"metadata": {
"id": "qFF2_ArssJBM",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"bba28fea430d436981b0bfab06fb4ee6",
"fe16833673ba4dc79001d9b3d28eb6d2",
"513e9fa4a8a04f889d68ac7658818465",
"b38e9ab8c6fa4ea1952a2265dcdcfff5",
"91c93a7307854ac98d9bdf6746286517",
"c291de1c9ba343ff8e140a8c7ef1496a",
"b1e72abd13c84f08b763eae36928fb4e",
"251fc9cb4f574aa4881b722cef11324a",
"71c5d3acf8674d77b38d6fa6b0aba8d5",
"94419c78831745928f8e5327f53ef5e0",
"31a5bb08896f47309e658dac697b6680"
]
},
"outputId": "d0e04cc6-0334-48a3-851e-9688ee5127cf"
},
"source": [
"metric = load_metric(\"seqeval\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bba28fea430d436981b0bfab06fb4ee6",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/2.48k [00:00<?, ?B/s]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ennxn1jysJBM"
},
"source": [
"This metric takes lists of labels for the predictions and references:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "YOfoAVULsJBN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "93aa3610-aa5b-4318-c3d2-02171e9596e9"
},
"source": [
"example = ner_train[4]\n",
"labels = example['tags']\n",
"metric.compute(predictions=[labels], references=[labels])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'DI': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},\n",
" 'Drugform': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},\n",
" 'overall_accuracy': 1.0,\n",
" 'overall_f1': 1.0,\n",
" 'overall_precision': 1.0,\n",
" 'overall_recall': 1.0}"
]
},
"metadata": {
"tags": []
},
"execution_count": 32
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7sZOdRlRIrJd"
},
"source": [
"So we will need to do a bit of post-processing on our predictions:\n",
"- select the predicted index (with the maximum logit) for each token\n",
"- convert it to its string label\n",
"- ignore everywhere we set a label of -100\n",
"\n",
"The following function does all this post-processing on the `EvalPrediction` object that the `Trainer` passes to `compute_metrics` during evaluation (it unpacks into predictions and labels) before applying the metric:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "UmvbnJ9JIrJd"
},
"source": [
"import numpy as np\n",
"\n",
"def compute_metrics(p):\n",
" predictions, labels = p\n",
" predictions = np.argmax(predictions, axis=2)\n",
"\n",
" # Remove ignored index (special tokens)\n",
" true_predictions = [\n",
" [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
" for prediction, label in zip(predictions, labels)\n",
" ]\n",
" true_labels = [\n",
" [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
" for prediction, label in zip(predictions, labels)\n",
" ]\n",
"\n",
" results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)\n",
" return {\n",
" \"precision\": results[\"overall_precision\"],\n",
" \"recall\": results[\"overall_recall\"],\n",
" \"f1\": results[\"overall_f1\"],\n",
" \"accuracy\": results[\"overall_accuracy\"],\n",
" }"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "rXuFTAzDIrJe"
},
"source": [
"Note that we drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.\n",
"\n",
"Then we just need to pass all of this along with our datasets to the `Trainer`:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "imY1oC3SIrJf"
},
"source": [
"trainer = Trainer(\n",
" model,\n",
" args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"test\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"id": "ZP1dvvNlXg9Y",
"outputId": "5615b5dc-3794-48d4-af11-12d9a6a0eac6"
},
"source": [
"trainer.evaluate()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags.\n",
"***** Running Evaluation *****\n",
" Num examples = 962\n",
" Batch size = 16\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='61' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [61/61 00:00]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'eval_accuracy': 0.07571226846083562,\n",
" 'eval_f1': 0.03137110167927662,\n",
" 'eval_loss': 2.604278326034546,\n",
" 'eval_precision': 0.018480269594521145,\n",
" 'eval_recall': 0.10372178157413056,\n",
" 'eval_runtime': 1.5067,\n",
" 'eval_samples_per_second': 638.492,\n",
" 'eval_steps_per_second': 40.486}"
]
},
"metadata": {
"tags": []
},
"execution_count": 35
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "a-xw5JvKzyrf"
},
"source": [
"В начале обучения заморозим все параметры в модели, кроме последнего слоя, и посмотрим, насколько хорошо она обучится."
]
},
{
"cell_type": "code",
"metadata": {
"id": "lzwwl_YQWKxq"
},
"source": [
"for param in model.bert.parameters():\n",
" param.requires_grad = False"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EhRisAHxWZRG",
"outputId": "95232006-ea19-46f8-a893-4fdfa44805f1"
},
"source": [
"for name, param in model.named_parameters():\n",
" if param.requires_grad:\n",
" print(name)\n",
" print(param)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"classifier.weight\n",
"Parameter containing:\n",
"tensor([[-5.3295e-02, 8.1591e-05, -1.4091e-02, ..., 9.4435e-03,\n",
" 2.6371e-02, -2.7459e-02],\n",
" [-1.4154e-02, 1.8980e-02, -6.4149e-03, ..., -3.0063e-02,\n",
" -8.0335e-03, -1.3474e-02],\n",
" [ 3.9226e-03, -1.7339e-03, -2.4043e-03, ..., 1.1911e-02,\n",
" -6.8623e-03, -3.6764e-02],\n",
" ...,\n",
" [ 2.9699e-02, -2.5830e-02, 2.9956e-03, ..., 2.0724e-02,\n",
" 2.6304e-02, -1.3127e-04],\n",
" [-2.8258e-02, 1.9521e-03, -1.2629e-02, ..., -2.4292e-02,\n",
" -1.9133e-02, 3.5226e-02],\n",
" [ 4.8563e-03, -3.9019e-02, 2.2573e-02, ..., 2.3094e-02,\n",
" -5.4334e-03, -3.1281e-02]], device='cuda:0', requires_grad=True)\n",
"classifier.bias\n",
"Parameter containing:\n",
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0',\n",
" requires_grad=True)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CdzABDVcIrJg"
},
"source": [
"We can now finetune our model by just calling the `train` method:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "nsuTXCjMeYHE"
},
"source": [
"import logging\n",
"from transformers.trainer import logger as noisy_logger\n",
"noisy_logger.setLevel(logging.WARNING)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yGz3c_A_sJBO",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 429
},
"outputId": "187b4d00-27fc-464b-da73-9fd47e2dc862"
},
"source": [
"trainer.train()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='2410' max='2410' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [2410/2410 00:31, Epoch 10/10]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" <th>Precision</th>\n",
" <th>Recall</th>\n",
" <th>F1</th>\n",
" <th>Accuracy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>No log</td>\n",
" <td>2.034866</td>\n",
" <td>0.032157</td>\n",
" <td>0.059487</td>\n",
" <td>0.041747</td>\n",
" <td>0.630991</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>No log</td>\n",
" <td>1.594469</td>\n",
" <td>0.042105</td>\n",
" <td>0.004881</td>\n",
" <td>0.008748</td>\n",
" <td>0.815724</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>2.043100</td>\n",
" <td>1.282439</td>\n",
" <td>0.052632</td>\n",
" <td>0.000305</td>\n",
" <td>0.000607</td>\n",
" <td>0.826314</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>2.043100</td>\n",
" <td>1.079169</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826854</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>1.267200</td>\n",
" <td>0.954540</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>1.267200</td>\n",
" <td>0.880644</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>0.932500</td>\n",
" <td>0.837882</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>0.932500</td>\n",
" <td>0.813664</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0.808700</td>\n",
" <td>0.801121</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>0.808700</td>\n",
" <td>0.797258</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.826896</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='122' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [61/61 00:53]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TrainOutput(global_step=2410, training_loss=1.181212188594074, metrics={'train_runtime': 31.5523, 'train_samples_per_second': 1219.246, 'train_steps_per_second': 76.381, 'total_flos': 35752217175750.0, 'train_loss': 1.181212188594074, 'epoch': 10.0})"
]
},
"metadata": {
"tags": []
},
"execution_count": 39
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "H14j1R3cbDPO"
},
"source": [
"Модель недообучилась: похоже, что нужно обучить больше слоёв. Разморозим их все (но, возможно, более правильно было бы разморозить лишь несколько верхних), и поучимся ещё эпох 20."
]
},
{
"cell_type": "code",
"metadata": {
"id": "65soVR9sbE77"
},
"source": [
"# разморозка\n",
"for param in model.parameters():\n",
" param.requires_grad = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u-3sfj5ocug0",
"outputId": "32b7bd21-4999-4f15-e1b8-cfd60dcdf4f8"
},
"source": [
"args = TrainingArguments(\n",
" \"ner\",\n",
" evaluation_strategy = \"epoch\",\n",
" learning_rate=1e-5,\n",
" per_device_train_batch_size=batch_size,\n",
" per_device_eval_batch_size=batch_size,\n",
" num_train_epochs=20,\n",
" weight_decay=0.01,\n",
" save_strategy='no',\n",
" report_to='none',\n",
")"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"PyTorch: setting up devices\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "wGW0r33pdLOy"
},
"source": [
"trainer = Trainer(\n",
" model,\n",
" args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"test\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 706
},
"id": "C5nZBs-BbFRq",
"outputId": "61effef3-dff8-4296-f3d2-00dedd88011f"
},
"source": [
"trainer.train()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='4820' max='4820' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [4820/4820 01:51, Epoch 20/20]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" <th>Precision</th>\n",
" <th>Recall</th>\n",
" <th>F1</th>\n",
" <th>Accuracy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>No log</td>\n",
" <td>0.584139</td>\n",
" <td>0.703590</td>\n",
" <td>0.209274</td>\n",
" <td>0.322596</td>\n",
" <td>0.851981</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>No log</td>\n",
" <td>0.516597</td>\n",
" <td>0.603892</td>\n",
" <td>0.321843</td>\n",
" <td>0.419900</td>\n",
" <td>0.863070</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.571100</td>\n",
" <td>0.474432</td>\n",
" <td>0.609095</td>\n",
" <td>0.384076</td>\n",
" <td>0.471094</td>\n",
" <td>0.871252</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.571100</td>\n",
" <td>0.446405</td>\n",
" <td>0.624401</td>\n",
" <td>0.437157</td>\n",
" <td>0.514265</td>\n",
" <td>0.878769</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.446100</td>\n",
" <td>0.423940</td>\n",
" <td>0.619102</td>\n",
" <td>0.496339</td>\n",
" <td>0.550965</td>\n",
" <td>0.885414</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.446100</td>\n",
" <td>0.405271</td>\n",
" <td>0.620240</td>\n",
" <td>0.536608</td>\n",
" <td>0.575401</td>\n",
" <td>0.889733</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>0.387700</td>\n",
" <td>0.391646</td>\n",
" <td>0.630487</td>\n",
" <td>0.556437</td>\n",
" <td>0.591152</td>\n",
" <td>0.893222</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>0.387700</td>\n",
" <td>0.381404</td>\n",
" <td>0.606738</td>\n",
" <td>0.587858</td>\n",
" <td>0.597149</td>\n",
" <td>0.894468</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0.349000</td>\n",
" <td>0.374620</td>\n",
" <td>0.603774</td>\n",
" <td>0.615009</td>\n",
" <td>0.609340</td>\n",
" <td>0.895922</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>0.349000</td>\n",
" <td>0.364899</td>\n",
" <td>0.621263</td>\n",
" <td>0.615009</td>\n",
" <td>0.618120</td>\n",
" <td>0.898787</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>0.320000</td>\n",
" <td>0.356865</td>\n",
" <td>0.638978</td>\n",
" <td>0.610128</td>\n",
" <td>0.624220</td>\n",
" <td>0.900573</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>0.320000</td>\n",
" <td>0.353724</td>\n",
" <td>0.621075</td>\n",
" <td>0.627517</td>\n",
" <td>0.624279</td>\n",
" <td>0.900365</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>0.304200</td>\n",
" <td>0.351088</td>\n",
" <td>0.612875</td>\n",
" <td>0.641855</td>\n",
" <td>0.627030</td>\n",
" <td>0.900947</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>0.304200</td>\n",
" <td>0.344875</td>\n",
" <td>0.635614</td>\n",
" <td>0.634838</td>\n",
" <td>0.635226</td>\n",
" <td>0.903273</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>0.290300</td>\n",
" <td>0.343057</td>\n",
" <td>0.632229</td>\n",
" <td>0.640329</td>\n",
" <td>0.636253</td>\n",
" <td>0.903107</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>0.290300</td>\n",
" <td>0.340833</td>\n",
" <td>0.637323</td>\n",
" <td>0.644905</td>\n",
" <td>0.641092</td>\n",
" <td>0.903896</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>0.278900</td>\n",
" <td>0.338196</td>\n",
" <td>0.647566</td>\n",
" <td>0.641245</td>\n",
" <td>0.644390</td>\n",
" <td>0.904892</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>0.278900</td>\n",
" <td>0.337507</td>\n",
" <td>0.638947</td>\n",
" <td>0.651617</td>\n",
" <td>0.645220</td>\n",
" <td>0.905100</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>0.267100</td>\n",
" <td>0.336935</td>\n",
" <td>0.637556</td>\n",
" <td>0.652532</td>\n",
" <td>0.644957</td>\n",
" <td>0.904975</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>0.267100</td>\n",
" <td>0.336756</td>\n",
" <td>0.637094</td>\n",
" <td>0.652837</td>\n",
" <td>0.644870</td>\n",
" <td>0.905017</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TrainOutput(global_step=4820, training_loss=0.35207317083208395, metrics={'train_runtime': 111.6227, 'train_samples_per_second': 689.286, 'train_steps_per_second': 43.181, 'total_flos': 71548216106580.0, 'train_loss': 0.35207317083208395, 'epoch': 20.0})"
]
},
"metadata": {
"tags": []
},
"execution_count": 43
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CKASz-2vIrJi"
},
"source": [
"The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "UOUcBkX8IrJi",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
},
"outputId": "c5772a91-7302-4f14-da69-c99eb281dcd1"
},
"source": [
"trainer.evaluate()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='61' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [61/61 00:00]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'epoch': 20.0,\n",
" 'eval_accuracy': 0.9050170279923582,\n",
" 'eval_f1': 0.6448696700316409,\n",
" 'eval_loss': 0.3367559015750885,\n",
" 'eval_precision': 0.6370943733253944,\n",
" 'eval_recall': 0.652837095790116,\n",
" 'eval_runtime': 1.1185,\n",
" 'eval_samples_per_second': 860.049,\n",
" 'eval_steps_per_second': 54.535}"
]
},
"metadata": {
"tags": []
},
"execution_count": 44
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BaMhVjZ-sJBO"
},
"source": [
"To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "wm8MsZ3tsJBO",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 509
},
"outputId": "6fdbd471-7987-47c3-cbd0-83a82bd45ce3"
},
"source": [
"predictions, labels, _ = trainer.predict(tokenized_datasets[\"test\"])\n",
"predictions = np.argmax(predictions, axis=2)\n",
"\n",
"# Remove ignored index (special tokens)\n",
"true_predictions = [\n",
" [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
" for prediction, label in zip(predictions, labels)\n",
"]\n",
"true_labels = [\n",
" [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
" for prediction, label in zip(predictions, labels)\n",
"]\n",
"\n",
"results = metric.compute(predictions=true_predictions, references=true_labels)\n",
"results"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='122' max='61' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [61/61 00:08]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/dist-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'ADR': {'f1': 0.30279898218829515,\n",
" 'number': 446,\n",
" 'precision': 0.35,\n",
" 'recall': 0.26681614349775784},\n",
" 'DI': {'f1': 0.493963782696177,\n",
" 'number': 821,\n",
" 'precision': 0.4207369323050557,\n",
" 'recall': 0.5980511571254568},\n",
" 'Drugclass': {'f1': 0.7868852459016393,\n",
" 'number': 336,\n",
" 'precision': 0.7880597014925373,\n",
" 'recall': 0.7857142857142857},\n",
" 'Drugform': {'f1': 0.7922794117647058,\n",
" 'number': 565,\n",
" 'precision': 0.8240917782026769,\n",
" 'recall': 0.7628318584070797},\n",
" 'Drugname': {'f1': 0.8734309623430963,\n",
" 'number': 918,\n",
" 'precision': 0.8400402414486922,\n",
" 'recall': 0.9095860566448801},\n",
" 'Finding': {'f1': 0.0, 'number': 192, 'precision': 0.0, 'recall': 0.0},\n",
" 'overall_accuracy': 0.9050170279923582,\n",
" 'overall_f1': 0.6448696700316409,\n",
" 'overall_precision': 0.6370943733253944,\n",
" 'overall_recall': 0.652837095790116}"
]
},
"metadata": {
"tags": []
},
"execution_count": 45
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "nI18Xeda7X8a"
},
"source": [
"from sklearn.metrics import confusion_matrix\n",
"import pandas as pd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 435
},
"id": "Yz9BkfrO7bg6",
"outputId": "d6ce002d-0803-4320-8711-ec33bdc9c40d"
},
"source": [
"cm = pd.DataFrame(\n",
" confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),\n",
" index=label_list,\n",
" columns=label_list\n",
")\n",
"cm"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>O</th>\n",
" <th>B-ADR</th>\n",
" <th>B-DI</th>\n",
" <th>B-Drugclass</th>\n",
" <th>B-Drugform</th>\n",
" <th>B-Drugname</th>\n",
" <th>B-Finding</th>\n",
" <th>I-ADR</th>\n",
" <th>I-DI</th>\n",
" <th>I-Drugclass</th>\n",
" <th>I-Drugform</th>\n",
" <th>I-Drugname</th>\n",
" <th>I-Finding</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>O</th>\n",
" <td>19494</td>\n",
" <td>29</td>\n",
" <td>175</td>\n",
" <td>35</td>\n",
" <td>60</td>\n",
" <td>71</td>\n",
" <td>0</td>\n",
" <td>20</td>\n",
" <td>26</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-ADR</th>\n",
" <td>159</td>\n",
" <td>135</td>\n",
" <td>133</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-DI</th>\n",
" <td>242</td>\n",
" <td>21</td>\n",
" <td>525</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-Drugclass</th>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>264</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-Drugform</th>\n",
" <td>98</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>432</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-Drugname</th>\n",
" <td>44</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>8</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B-Finding</th>\n",
" <td>56</td>\n",
" <td>32</td>\n",
" <td>87</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-ADR</th>\n",
" <td>180</td>\n",
" <td>51</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-DI</th>\n",
" <td>236</td>\n",
" <td>17</td>\n",
" <td>102</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" <td>46</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-Drugclass</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-Drugform</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-Drugname</th>\n",
" <td>19</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I-Finding</th>\n",
" <td>25</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" O B-ADR B-DI ... I-Drugform I-Drugname I-Finding\n",
"O 19494 29 175 ... 0 0 0\n",
"B-ADR 159 135 133 ... 0 0 0\n",
"B-DI 242 21 525 ... 0 0 0\n",
"B-Drugclass 50 1 17 ... 0 0 0\n",
"B-Drugform 98 4 11 ... 0 0 0\n",
"B-Drugname 44 1 16 ... 0 0 0\n",
"B-Finding 56 32 87 ... 0 0 0\n",
"I-ADR 180 51 40 ... 0 0 0\n",
"I-DI 236 17 102 ... 0 0 0\n",
"I-Drugclass 0 0 0 ... 0 0 0\n",
"I-Drugform 3 0 0 ... 0 0 0\n",
"I-Drugname 19 0 0 ... 0 0 0\n",
"I-Finding 25 7 6 ... 0 0 0\n",
"\n",
"[13 rows x 13 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 47
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cA0jWZwjVbI7",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4161d7fe-c5e3-4f56-b9d4-52830e14da06"
},
"source": [
"model.save_pretrained('ner_bert.bin')\n",
"tokenizer.save_pretrained('ner_bert.bin')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Configuration saved in ner_bert.bin/config.json\n",
"Model weights saved in ner_bert.bin/pytorch_model.bin\n",
"tokenizer config file saved in ner_bert.bin/tokenizer_config.json\n",
"Special tokens file saved in ner_bert.bin/special_tokens_map.json\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('ner_bert.bin/tokenizer_config.json',\n",
" 'ner_bert.bin/special_tokens_map.json',\n",
" 'ner_bert.bin/vocab.txt',\n",
" 'ner_bert.bin/added_tokens.json',\n",
" 'ner_bert.bin/tokenizer.json')"
]
},
"metadata": {
"tags": []
},
"execution_count": 48
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "C5yv9hItsJBP"
},
"source": [
"# Применение модели"
]
},
{
"cell_type": "code",
"metadata": {
"id": "p0JHjRKmuv_m"
},
"source": [
"import torch"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "Kp59uTtXZKT4",
"outputId": "6ed0bcf0-7de7-4b93-936b-0562ef0505b4"
},
"source": [
"text = ' '.join(ner_train[8]['tokens'])\n",
"text = ' '.join(ner_test[4]['tokens'])\n",
"text"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .'"
]
},
"metadata": {
"tags": []
},
"execution_count": 50
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "6h2hiUylZVmF"
},
"source": [
"import torch"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Yt8EXbDuuB1U",
"outputId": "bf2699a1-c42f-42bb-b00b-8adb7aca1edd"
},
"source": [
"tokens = tokenizer(text, return_tensors='pt')\n",
"tokens = {k: v.to(model.device) for k, v in tokens.items()}\n",
"\n",
"with torch.no_grad():\n",
" pred = model(**tokens)\n",
"pred.logits.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"torch.Size([1, 29, 13])"
]
},
"metadata": {
"tags": []
},
"execution_count": 52
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2GQPlbnyuu6H",
"outputId": "e43272b6-3e22-44bb-ec7b-15a2acf570d3"
},
"source": [
"indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()\n",
"token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])\n",
"for t, idx in zip(token_text, indices):\n",
" print(f'{t:15s} {label_list[idx]:10s}')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[CLS] O \n",
"О O \n",
"##хо O \n",
"##тно O \n",
"при O \n",
"##мен O \n",
"##я O \n",
"##ю O \n",
"его O \n",
"при O \n",
"борьбе O \n",
"с O \n",
"нас B-DI \n",
"##мор B-DI \n",
"##ком B-DI \n",
", O \n",
"что O \n",
"в O \n",
"м O \n",
"##ое O \n",
"##м O \n",
"случае O \n",
"я O \n",
"##вление O \n",
"очень O \n",
"часто O \n",
"##е O \n",
". O \n",
"[SEP] O \n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tBSq9enuwJ_V"
},
"source": [
"Более простое применение модели: пайплайн от huggingface"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lnrAoy6b8swA"
},
"source": [
"from transformers import pipeline"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uowfISMu8v1k"
},
"source": [
"pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1WjXMXCv9Nde",
"outputId": "e2b4f6d6-6153-4bbd-915d-a3bc4d148895"
},
"source": [
"print(text)\n",
"print(pipe(text))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .\n",
"[{'entity_group': 'DI', 'score': 0.73669535, 'word': 'насморком', 'start': 33, 'end': 42}]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "panjTvbH9PJL"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment