Skip to content

Instantly share code, notes, and snippets.

@avidale
Last active December 11, 2023 04:22
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save avidale/4de1454bf41822dc862fddbd779d4cc6 to your computer and use it in GitHub Desktop.
Save avidale/4de1454bf41822dc862fddbd779d4cc6 to your computer and use it in GitHub Desktop.
finetune_rut5-base-multitask.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "finetune_rut5-base-multitask.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyM06nvRLsFHInOZudgZRJIu",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"f8844d27e6b446b89b88cf3de8ca000e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_ae42bce4e7434690892a2d557e30bd90",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_cc442314241b43838c2dbeef00e8d2c4",
"IPY_MODEL_0a252be55b7a42109f8539ccbad8f0a5"
]
}
},
"ae42bce4e7434690892a2d557e30bd90": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"cc442314241b43838c2dbeef00e8d2c4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_083bbcd1930e440ead4beb8bf00cae63",
"_dom_classes": [],
"description": "Downloading: ",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 2032,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 2032,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_ac2a7c9a58bd4b67b726475f1204d369"
}
},
"0a252be55b7a42109f8539ccbad8f0a5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0d668a9c1ef84a6f9f5424b3e00f1c2b",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 4.58k/? [00:04<00:00, 1.07kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e8fa65b8dc0341ccbd2d31a2afdd64d6"
}
},
"083bbcd1930e440ead4beb8bf00cae63": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"ac2a7c9a58bd4b67b726475f1204d369": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0d668a9c1ef84a6f9f5424b3e00f1c2b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"e8fa65b8dc0341ccbd2d31a2afdd64d6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"627ea129c40f42d7832fffaeb9668552": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_bb82b7e969764fa0bedca12a017734f4",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_7e29af6e8140480bb713daed1b6a8b8a",
"IPY_MODEL_12aac9cd61d4437eaca879c02450531d"
]
}
},
"bb82b7e969764fa0bedca12a017734f4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7e29af6e8140480bb713daed1b6a8b8a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_09807cf014cc4f22921f66df99710803",
"_dom_classes": [],
"description": "Downloading: ",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1608,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1608,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fc1939abb4b34011a3b5b2ee43b4aada"
}
},
"12aac9cd61d4437eaca879c02450531d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_9a3f37a1a0e04992883e04ed101b4384",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 10.1k/? [00:00<00:00, 164kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_38a8d6f2af734d00bcaaffb40d7d209c"
}
},
"09807cf014cc4f22921f66df99710803": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"fc1939abb4b34011a3b5b2ee43b4aada": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9a3f37a1a0e04992883e04ed101b4384": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"38a8d6f2af734d00bcaaffb40d7d209c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"59d6a7c59bf84e2c9d7d19e8854e2e59": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_fcf29d5e37204b29bcbd63947da7d2a9",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_2f0b529794bd49b5bfada2ac31a1f028",
"IPY_MODEL_2ce76d15d11b4ac7968491cf97ce2ee8"
]
}
},
"fcf29d5e37204b29bcbd63947da7d2a9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2f0b529794bd49b5bfada2ac31a1f028": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_78012fea667c43969cb495f5077a118c",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 57356138,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 57356138,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_8ffc760a98154f77b08dc17996ce0588"
}
},
"2ce76d15d11b4ac7968491cf97ce2ee8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_1c11643d8b514589bf8a1a133f2cebde",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 57.4M/57.4M [00:02<00:00, 19.4MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_ae6e87ed8b6449ada514be83037e961e"
}
},
"78012fea667c43969cb495f5077a118c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"8ffc760a98154f77b08dc17996ce0588": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"1c11643d8b514589bf8a1a133f2cebde": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"ae6e87ed8b6449ada514be83037e961e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"91d245baafb64665b332ea8a91270761": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_b3f3077461554e8a89bc8e86e4f40053",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_f6841e41679048498ceca4c10f501e24",
"IPY_MODEL_93819fefe31a4ea392bcccd3a7d4c404"
]
}
},
"b3f3077461554e8a89bc8e86e4f40053": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f6841e41679048498ceca4c10f501e24": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_0def9d63a9d44160b34af92ceb4f200a",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "info",
"max": 1,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fc26de5f2d5644839103cab11f624ab8"
}
},
"93819fefe31a4ea392bcccd3a7d4c404": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0569fed3d4b147c98b4120c67ecb48b3",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 572717/0 [00:15<00:00, 39505.54 examples/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_859e30e5556a4ae099e7b61efdcd9bbe"
}
},
"0def9d63a9d44160b34af92ceb4f200a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"fc26de5f2d5644839103cab11f624ab8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0569fed3d4b147c98b4120c67ecb48b3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"859e30e5556a4ae099e7b61efdcd9bbe": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6ba8c5986d774fc6b0b1b18c5b6af50f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_875eb5c6dd114e2a8165692a17d21ee7",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_b8d0d69993d74a38af2aa820fafdd07f",
"IPY_MODEL_e2e4b5bb00874d09848d3fb0f609d573"
]
}
},
"875eb5c6dd114e2a8165692a17d21ee7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b8d0d69993d74a38af2aa820fafdd07f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_c50e0ae690fc4370af8f539e1767cf33",
"_dom_classes": [],
"description": "Downloading: ",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 3015,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 3015,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_da735a401675421daaf34014fb842e49"
}
},
"e2e4b5bb00874d09848d3fb0f609d573": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_e36d7f34317d4146ab02195455ac5a39",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 7.44k/? [00:00<00:00, 20.1kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7c3856c1b11a434ebd9f64b0bfbc3a44"
}
},
"c50e0ae690fc4370af8f539e1767cf33": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"da735a401675421daaf34014fb842e49": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e36d7f34317d4146ab02195455ac5a39": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"7c3856c1b11a434ebd9f64b0bfbc3a44": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c749b82fbb5e4b418981a9c4d82dd672": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_d59b25a4083e40afbe5e421cc299de45",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_2b29954976594cb3990ec1871c7474f3",
"IPY_MODEL_c94ddfc609d54b4595c50bb572dc8fb2"
]
}
},
"d59b25a4083e40afbe5e421cc299de45": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2b29954976594cb3990ec1871c7474f3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_0bc504761907452c94b9b98234c1d73e",
"_dom_classes": [],
"description": "Downloading: ",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 4687,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 4687,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_27799ef409b44116851fe9727dfe3280"
}
},
"c94ddfc609d54b4595c50bb572dc8fb2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_cf7d719a669941c397eda28c7f143fde",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 201k/? [00:00<00:00, 3.24MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_844fbf6cc43b4fe0a1485e20a29092b1"
}
},
"0bc504761907452c94b9b98234c1d73e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"27799ef409b44116851fe9727dfe3280": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"cf7d719a669941c397eda28c7f143fde": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"844fbf6cc43b4fe0a1485e20a29092b1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3c4852397902413895b8a43d0a650d35": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_9171be7445ec4c7c843c57befe4d444a",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_cad93850ebf0427dbd6e8d820660284f",
"IPY_MODEL_a0893a137e3e47c896f6d29b12aa092b"
]
}
},
"9171be7445ec4c7c843c57befe4d444a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"cad93850ebf0427dbd6e8d820660284f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_697533ed02414e5ab5db5c63623b1272",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 32213126,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 32213126,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_eb4b90e7610a4e2aafb4b9dc0f663510"
}
},
"a0893a137e3e47c896f6d29b12aa092b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_592e174a265747768ee569983b10d9b3",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 32.2M/32.2M [00:01<00:00, 18.3MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_aa678f814a4c4359b469ac0c13472b64"
}
},
"697533ed02414e5ab5db5c63623b1272": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"eb4b90e7610a4e2aafb4b9dc0f663510": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"592e174a265747768ee569983b10d9b3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"aa678f814a4c4359b469ac0c13472b64": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a5f7849f52c64c49bf6547c5ad4bdcbb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_29dc51f266aa4405b91382a4b1f9e035",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_5a6ef5c2db3f402ca4b05f00eaae99e8",
"IPY_MODEL_195c7fabd3e145edb8f6079c8b4bd86f"
]
}
},
"29dc51f266aa4405b91382a4b1f9e035": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5a6ef5c2db3f402ca4b05f00eaae99e8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_1c401c6a60e14c14982b4225dd1c0650",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "info",
"max": 1,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b70f5c849b354e5faf7ebb514564fa63"
}
},
"195c7fabd3e145edb8f6079c8b4bd86f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_a1ba36964abe49bd9b9118488fc99851",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 251263/0 [00:13<00:00, 20337.27 examples/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_8072067e3ca44623b86e27fdf9dfaa22"
}
},
"1c401c6a60e14c14982b4225dd1c0650": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"b70f5c849b354e5faf7ebb514564fa63": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a1ba36964abe49bd9b9118488fc99851": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"8072067e3ca44623b86e27fdf9dfaa22": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ada8592ba4cf4c34898ad441322fb562": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_dabb9c226d95423e8743a1154fc46dc9",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_fc5f1633ac0f4620b26c829b5469fb3c",
"IPY_MODEL_520da3083b19406fa35974e03c91be29"
]
}
},
"dabb9c226d95423e8743a1154fc46dc9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"fc5f1633ac0f4620b26c829b5469fb3c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_fc407af33d964efb8db1b638e6719864",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 251263,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 251263,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5aee130cb56e463e8dd61dfff9a31bb5"
}
},
"520da3083b19406fa35974e03c91be29": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_9c5830fb70f8477baed04891c1f5c15d",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 251263/251263 [01:50<00:00, 2267.59it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fd80d90c8a504c269ee4b713a584beab"
}
},
"fc407af33d964efb8db1b638e6719864": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"5aee130cb56e463e8dd61dfff9a31bb5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9c5830fb70f8477baed04891c1f5c15d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"fd80d90c8a504c269ee4b713a584beab": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9eb9f30e7786438ebcf30d92b01f8ab9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_73dcb65dabe84f84a76e778b17fa731e",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_b62cc7f2e3aa42b6b1a0346f2c1d9f88",
"IPY_MODEL_599be3cd8b6347c88a709dca14b78540"
]
}
},
"73dcb65dabe84f84a76e778b17fa731e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b62cc7f2e3aa42b6b1a0346f2c1d9f88": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_ab53ae4035db4983a91ff99af7cbe0ca",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 616216,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 616216,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e84f51d728514152965543276403c9bf"
}
},
"599be3cd8b6347c88a709dca14b78540": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_ee5f1b7f0bea49a8bb09d64c348ee53e",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 616216/616216 [00:12<00:00, 48169.99it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b738256f890f48da8af4c099212aafd1"
}
},
"ab53ae4035db4983a91ff99af7cbe0ca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"e84f51d728514152965543276403c9bf": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ee5f1b7f0bea49a8bb09d64c348ee53e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b738256f890f48da8af4c099212aafd1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9bb8c05692754303b3d8d5cb10afc78e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_c92e13a65d4c4186ae14c095a75277d3",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_151271018a26466cbe38c279f04f3e6c",
"IPY_MODEL_f1d516d33df44344b759e444ceeb62ca"
]
}
},
"c92e13a65d4c4186ae14c095a75277d3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"151271018a26466cbe38c279f04f3e6c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_fb803c4b595a4a85ae409dfa00565d61",
"_dom_classes": [],
"description": " 4%",
"_model_name": "FloatProgressModel",
"bar_style": "danger",
"max": 251263,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 9985,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3413c51a743544a8b6bfa7d7893c3aa7"
}
},
"f1d516d33df44344b759e444ceeb62ca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_6dd6237ec039423ebdde6ea47ad48a12",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 9985/251263 [00:50<12:41, 316.77it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4ee2fd4a7fcb40cabb13cb81cbd364a6"
}
},
"fb803c4b595a4a85ae409dfa00565d61": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"3413c51a743544a8b6bfa7d7893c3aa7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6dd6237ec039423ebdde6ea47ad48a12": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"4ee2fd4a7fcb40cabb13cb81cbd364a6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0a0dec11e0354f0db5871ed73cacb3f1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_0ee6cc753f584c1785ff982167ad9c04",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_796c08d68cc6443dbc5b4c9100cc7f0a",
"IPY_MODEL_c2fb21304bea46ddb9e582c92231231d"
]
}
},
"0ee6cc753f584c1785ff982167ad9c04": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"796c08d68cc6443dbc5b4c9100cc7f0a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_78877bf42425411485ba4a334ca8f202",
"_dom_classes": [],
"description": "loss: 1.4789641368957018: 0%",
"_model_name": "FloatProgressModel",
"bar_style": "danger",
"max": 50000000,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 75000,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4e392d8d865f4a61b85c64c7f14bb7fd"
}
},
"c2fb21304bea46ddb9e582c92231231d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_2e8e2326727a4e48b24b4c47c9fb1f61",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 75000/50000000 [2:55:32<1917:25:49, 7.23it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6a1d007fdb4447a1a8511e4eb450285b"
}
},
"78877bf42425411485ba4a334ca8f202": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"4e392d8d865f4a61b85c64c7f14bb7fd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2e8e2326727a4e48b24b4c47c9fb1f61": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"6a1d007fdb4447a1a8511e4eb450285b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/avidale/4de1454bf41822dc862fddbd779d4cc6/finetune_rut5-base-multitask.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "IWqqEHTK-_sX"
},
"source": [
"!pip install transformers sentencepiece datasets natasha"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "1ylSX50n_uCx"
},
"source": [
"We finetune our Russian-English T5 model on several tasks:\n",
"* Translation with https://huggingface.co/datasets/opus_wikipedia\n",
"* Paraphrasing with https://huggingface.co/datasets/tapaco\n",
"* Filling the gaps in a text\n",
"* Restoring the text from a noisy bag of words"
]
},
{
"cell_type": "code",
"metadata": {
"id": "WOVgxMvk_Gwt"
},
"source": [
"from transformers import (\n",
" AdamW,\n",
" T5ForConditionalGeneration,\n",
" T5Tokenizer,\n",
" get_linear_schedule_with_warmup\n",
")\n",
"import torch\n",
"from datasets import load_dataset"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EbzZ3S5z-rE2",
"outputId": "2c1b5b41-38c3-4756-8ded-d060f5bf1051"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/gd')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /gd\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Lkm1FOM5_RWV"
},
"source": [
"import os  # needed for the checkpoint existence check below\n",
"\n",
"# Hub checkpoint used as the starting point when no saved model is found.\n",
"raw_model = 'cointegrated/rut5-base'\n",
"# Model directory under the '/gd' mount point; if it exists, it is loaded instead.\n",
"MODEL_NAME = '/gd/MyDrive/models/rut5-base-partial'\n",
"\n",
"if os.path.exists(MODEL_NAME):  # continue fine-tuning\n",
"    raw_model = MODEL_NAME\n",
"# Model and tokenizer are always loaded from the same source.\n",
"model = T5ForConditionalGeneration.from_pretrained(raw_model)\n",
"tokenizer = T5Tokenizer.from_pretrained(raw_model)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "QiPzHJeS_SGe"
},
"source": [
"### Create the tasks\n",
"\n",
"```\n",
"def task():\n",
" return input_text, output_text\n",
"```"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 257,
"referenced_widgets": [
"f8844d27e6b446b89b88cf3de8ca000e",
"ae42bce4e7434690892a2d557e30bd90",
"cc442314241b43838c2dbeef00e8d2c4",
"0a252be55b7a42109f8539ccbad8f0a5",
"083bbcd1930e440ead4beb8bf00cae63",
"ac2a7c9a58bd4b67b726475f1204d369",
"0d668a9c1ef84a6f9f5424b3e00f1c2b",
"e8fa65b8dc0341ccbd2d31a2afdd64d6",
"627ea129c40f42d7832fffaeb9668552",
"bb82b7e969764fa0bedca12a017734f4",
"7e29af6e8140480bb713daed1b6a8b8a",
"12aac9cd61d4437eaca879c02450531d",
"09807cf014cc4f22921f66df99710803",
"fc1939abb4b34011a3b5b2ee43b4aada",
"9a3f37a1a0e04992883e04ed101b4384",
"38a8d6f2af734d00bcaaffb40d7d209c",
"59d6a7c59bf84e2c9d7d19e8854e2e59",
"fcf29d5e37204b29bcbd63947da7d2a9",
"2f0b529794bd49b5bfada2ac31a1f028",
"2ce76d15d11b4ac7968491cf97ce2ee8",
"78012fea667c43969cb495f5077a118c",
"8ffc760a98154f77b08dc17996ce0588",
"1c11643d8b514589bf8a1a133f2cebde",
"ae6e87ed8b6449ada514be83037e961e",
"91d245baafb64665b332ea8a91270761",
"b3f3077461554e8a89bc8e86e4f40053",
"f6841e41679048498ceca4c10f501e24",
"93819fefe31a4ea392bcccd3a7d4c404",
"0def9d63a9d44160b34af92ceb4f200a",
"fc26de5f2d5644839103cab11f624ab8",
"0569fed3d4b147c98b4120c67ecb48b3",
"859e30e5556a4ae099e7b61efdcd9bbe"
]
},
"id": "xgo3bG1D_Nlv",
"outputId": "edde084a-05bd-45ee-fe27-708c1a64971e"
},
"source": [
"opus_wiki = load_dataset(\"opus_wikipedia\", lang1=\"en\", lang2=\"ru\")\n",
"len(opus_wiki)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f8844d27e6b446b89b88cf3de8ca000e",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2032.0, style=ProgressStyle(description…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "627ea129c40f42d7832fffaeb9668552",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1608.0, style=ProgressStyle(description…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"Using custom data configuration en-ru-lang1=en,lang2=ru\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n",
"Downloading and preparing dataset opus_wikipedia/en-ru (download: 54.70 MiB, generated: 159.88 MiB, post-processed: Unknown size, total: 214.58 MiB) to /root/.cache/huggingface/datasets/opus_wikipedia/en-ru-lang1=en,lang2=ru/0.0.0/4a18b1be119afcbc678dac8b8f58888a10016b2ba19ea2ca0adfb4777f0d2b6b...\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "59d6a7c59bf84e2c9d7d19e8854e2e59",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=57356138.0, style=ProgressStyle(descrip…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "91d245baafb64665b332ea8a91270761",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\rDataset opus_wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/opus_wikipedia/en-ru-lang1=en,lang2=ru/0.0.0/4a18b1be119afcbc678dac8b8f58888a10016b2ba19ea2ca0adfb4777f0d2b6b. Subsequent calls will reuse this data.\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1"
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8RokDxMJ_ihA",
"outputId": "57b95b8f-f0d2-4af3-8ee7-98b7c779d48a"
},
"source": [
"print(len(opus_wiki['train']))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"572717\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cBa_UWN_pJz",
"outputId": "2ae777f8-54c6-4a6d-8531-95733ed0b202"
},
"source": [
"import random\n",
"random.choice(opus_wiki['train'])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'id': '255231',\n",
" 'translation': {'en': 'The air-delivery system provides 50% more fresh air than is required by New York City Building Code, and a number of recycling chutes serve the entire building.',\n",
" 'ru': 'Система подачи воздуха предоставляет на 50 % больше свежего воздуха, чем требуется строительным кодексом Нью-Йорка, а несколько мусоросбросов обслуживают все здание.'}}"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0UQtiALxDnmr",
"outputId": "18aa844d-840c-49ba-983a-bda814955f1b"
},
"source": [
"def translate_task():\n",
"    \"\"\"Build one translation example from a random `opus_wiki` training pair.\n",
"\n",
"    The direction (ru-en or en-ru) is chosen with probability 0.5 each and is\n",
"    encoded in the task prefix of the input text.\n",
"\n",
"    Returns:\n",
"        tuple: (input_text, output_text).\n",
"    \"\"\"\n",
"    item = random.choice(opus_wiki['train'])['translation']\n",
"    if random.random() < 0.5:\n",
"        return f'translate ru-en | {item[\"ru\"]}', item[\"en\"]\n",
"    else:\n",
"        return f'translate en-ru | {item[\"en\"]}', item[\"ru\"]\n",
"\n",
"translate_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('translate ru-en | World Service не получает финансирование для радиопередач к Великобритании, и надежный средний прием волны был возможен на только на юго-востоке Англии на 648 kHz — и оно прекратилось в 2011 году, из-за снижения расходов.',\n",
" '===UK===The BBC World Service does not receive funding for broadcasts to the UK, and reliable medium wave reception was possible in only southeast of England from the 648 kHz service which ceased in 2011 as a cost-cutting measure.')"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Le6oPDor_4qx"
},
"source": [
"# mlsum = load_dataset(\"mlsum\", 'ru')\n",
"# print(len(mlsum['train'])) # 25K"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hMP5zlIUAbM3"
},
"source": [
"# random.choice(mlsum['train'])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6gxT-ikm_AZa"
},
"source": [
"# pd.Series([len(tokenizer.tokenize(random.choice(mlsum['train'])['text'])) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])\n",
"# 50% - 800, 75% 2K, 95% - 5K"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tpsJx6muEJxD"
},
"source": [
"Just ignore this task, texts are too long"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 222,
"referenced_widgets": [
"6ba8c5986d774fc6b0b1b18c5b6af50f",
"875eb5c6dd114e2a8165692a17d21ee7",
"b8d0d69993d74a38af2aa820fafdd07f",
"e2e4b5bb00874d09848d3fb0f609d573",
"c50e0ae690fc4370af8f539e1767cf33",
"da735a401675421daaf34014fb842e49",
"e36d7f34317d4146ab02195455ac5a39",
"7c3856c1b11a434ebd9f64b0bfbc3a44",
"c749b82fbb5e4b418981a9c4d82dd672",
"d59b25a4083e40afbe5e421cc299de45",
"2b29954976594cb3990ec1871c7474f3",
"c94ddfc609d54b4595c50bb572dc8fb2",
"0bc504761907452c94b9b98234c1d73e",
"27799ef409b44116851fe9727dfe3280",
"cf7d719a669941c397eda28c7f143fde",
"844fbf6cc43b4fe0a1485e20a29092b1",
"3c4852397902413895b8a43d0a650d35",
"9171be7445ec4c7c843c57befe4d444a",
"cad93850ebf0427dbd6e8d820660284f",
"a0893a137e3e47c896f6d29b12aa092b",
"697533ed02414e5ab5db5c63623b1272",
"eb4b90e7610a4e2aafb4b9dc0f663510",
"592e174a265747768ee569983b10d9b3",
"aa678f814a4c4359b469ac0c13472b64",
"a5f7849f52c64c49bf6547c5ad4bdcbb",
"29dc51f266aa4405b91382a4b1f9e035",
"5a6ef5c2db3f402ca4b05f00eaae99e8",
"195c7fabd3e145edb8f6079c8b4bd86f",
"1c401c6a60e14c14982b4225dd1c0650",
"b70f5c849b354e5faf7ebb514564fa63",
"a1ba36964abe49bd9b9118488fc99851",
"8072067e3ca44623b86e27fdf9dfaa22"
]
},
"id": "Sb1P37ViAfbk",
"outputId": "22807b7d-58df-4a65-dd95-ed0e8d7eb80d"
},
"source": [
"tapaco = load_dataset('tapaco', 'ru')"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ba8c5986d774fc6b0b1b18c5b6af50f",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3015.0, style=ProgressStyle(description…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c749b82fbb5e4b418981a9c4d82dd672",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4687.0, style=ProgressStyle(description…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n",
"Downloading and preparing dataset tapaco/ru (download: 30.72 MiB, generated: 23.40 MiB, post-processed: Unknown size, total: 54.12 MiB) to /root/.cache/huggingface/datasets/tapaco/ru/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698...\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3c4852397902413895b8a43d0a650d35",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=32213126.0, style=ProgressStyle(descrip…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a5f7849f52c64c49bf6547c5ad4bdcbb",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\rDataset tapaco downloaded and prepared to /root/.cache/huggingface/datasets/tapaco/ru/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698. Subsequent calls will reuse this data.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YUeY-qqqBIVx",
"outputId": "44c87a9d-88c4-4400-c390-8c7e76977da5"
},
"source": [
"tapaco"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['language', 'lists', 'paraphrase', 'paraphrase_set_id', 'sentence_id', 'tags'],\n",
" num_rows: 251263\n",
" })\n",
"})"
]
},
"metadata": {
"tags": []
},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67,
"referenced_widgets": [
"ada8592ba4cf4c34898ad441322fb562",
"dabb9c226d95423e8743a1154fc46dc9",
"fc5f1633ac0f4620b26c829b5469fb3c",
"520da3083b19406fa35974e03c91be29",
"fc407af33d964efb8db1b638e6719864",
"5aee130cb56e463e8dd61dfff9a31bb5",
"9c5830fb70f8477baed04891c1f5c15d",
"fd80d90c8a504c269ee4b713a584beab"
]
},
"id": "rj77kSJUBlEL",
"outputId": "8995f011-8734-4a8c-c460-f2f020ca1f8f"
},
"source": [
"from collections import Counter, defaultdict\n",
"from tqdm.auto import tqdm, trange\n",
"cnt = Counter()\n",
"p2s = defaultdict(list)\n",
"for i, e in enumerate(tqdm(tapaco['train'])):\n",
" cnt[e['paraphrase_set_id']] += 1\n",
" p2s[e['paraphrase_set_id']].append(i)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ada8592ba4cf4c34898ad441322fb562",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=251263.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "aPsdzDn8B1ht"
},
"source": [
"ph = random.choice(tapaco['train'])['paraphrase_set_id']"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NYPJK3rhEPzb",
"outputId": "9a157408-29bc-4ba4-a681-096bb16d63b1"
},
"source": [
"def paraphrase_task():\n",
"    \"\"\"Build one paraphrasing example from a random `tapaco` paraphrase set.\n",
"\n",
"    A random training row selects the paraphrase set; the texts of that set\n",
"    (looked up through `p2s`) are shuffled and the first two are used.\n",
"\n",
"    Returns:\n",
"        tuple: (input_text, output_text).\n",
"    \"\"\"\n",
"    set_id = random.choice(tapaco['train'])['paraphrase_set_id']\n",
"    candidates = tapaco['train'][p2s[set_id]]['paraphrase']\n",
"    random.shuffle(candidates)\n",
"    source_text, target_text = candidates[0], candidates[1]\n",
"    return f'paraphrase | {source_text}', target_text\n",
"\n",
"paraphrase_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('paraphrase | Том крайне наивен.', 'Том крайне доверчив.')"
]
},
"metadata": {
"tags": []
},
"execution_count": 29
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sNjEe2tjoTN-"
},
"source": [
"Use a large Russian corpus - take a file from https://wortschatz.uni-leipzig.de/en/download/Russian"
]
},
{
"cell_type": "code",
"metadata": {
"id": "gVRNgVc0oVnI",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4d433a87-4f87-4e07-9095-0c6f5aca0415"
},
"source": [
"!wget http://pcai056.informatik.uni-leipzig.de/downloads/corpora/rus-ru_web-public_2019_1M.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-10 21:01:08-- http://pcai056.informatik.uni-leipzig.de/downloads/corpora/rus-ru_web-public_2019_1M.tar.gz\n",
"Resolving pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)... 139.18.2.216\n",
"Connecting to pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)|139.18.2.216|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 206133725 (197M) [application/x-gzip]\n",
"Saving to: ‘rus-ru_web-public_2019_1M.tar.gz’\n",
"\n",
"rus-ru_web-public_2 100%[===================>] 196.58M 68.6MB/s in 2.9s \n",
"\n",
"2021-06-10 21:01:11 (68.6 MB/s) - ‘rus-ru_web-public_2019_1M.tar.gz’ saved [206133725/206133725]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "sxRSwWbMomu8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9a924a0b-efa3-4cdb-e085-7df1fd73fde4"
},
"source": [
"!tar -xsvf rus-ru_web-public_2019_1M.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"rus-ru_web-public_2019_1M/\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-inv_so.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-import.sql\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-co_s.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-sentences.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-sources.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-co_n.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-words.txt\n",
"rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-inv_w.txt\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 135
},
"id": "fnGpCba8oiNi",
"outputId": "b086f55f-c4d2-466f-fdd4-d8dc188156a6"
},
"source": [
"import pandas as pd\n",
"import csv\n",
"fname = 'rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-sentences.txt'\n",
"df_leipzig = pd.read_csv(fname, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"df_leipzig.columns = ['idx', 'text']\n",
"df_leipzig.sample(3)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>idx</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>361939</th>\n",
" <td>361940</td>\n",
" <td>Как же сразу не заметна пасмурная погода и нак...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96707</th>\n",
" <td>96708</td>\n",
" <td>Владимир Абрамов: «Великих тренеров – Бышовца,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>929674</th>\n",
" <td>929675</td>\n",
" <td>Церковь нажимала всё сильней, а местные всё не...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" idx text\n",
"361939 361940 Как же сразу не заметна пасмурная погода и нак...\n",
"96707 96708 Владимир Абрамов: «Великих тренеров – Бышовца,...\n",
"929674 929675 Церковь нажимала всё сильней, а местные всё не..."
]
},
"metadata": {
"tags": []
},
"execution_count": 32
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GaT4RjunEhj7",
"outputId": "a3b94204-54da-4f12-d6be-c447fd7c75c1"
},
"source": [
"def fill_gap_task():\n",
"    \"\"\"Build one gap-filling example from a random `df_leipzig` sentence.\n",
"\n",
"    A contiguous run of words that excludes the first and the last word is\n",
"    cut out and replaced by a placeholder: either the generic `___` or `_N_`,\n",
"    where N is the number of removed words (probability 0.5 each).\n",
"\n",
"    Returns:\n",
"        tuple: (input_text, output_text); input_text starts with `fill |` and\n",
"        contains the placeholder, output_text is the removed words.\n",
"    \"\"\"\n",
"    # Alternative text source: random.choice(tapaco['train'])['paraphrase']\n",
"    text = random.choice(df_leipzig.text)\n",
"    words = text.split()\n",
"    if len(words) < 3:\n",
"        # Too short to leave a word on both sides of the gap: draw another sentence.\n",
"        return fill_gap_task()\n",
"    # Inclusive gap bounds; index 0 and the last index are never part of the gap.\n",
"    right_id = random.randint(1, len(words)-2)\n",
"    left_id = random.randint(1, right_id)\n",
"    if random.random() < 0.5:\n",
"        filler = ['___']\n",
"    else:\n",
"        filler = [f'_{right_id-left_id+1}_']\n",
"    lhs = ' '.join(['fill |'] + words[:left_id] + filler + words[right_id+1:])\n",
"    rhs = ' '.join(words[left_id:(right_id+1)])\n",
"    return lhs, rhs\n",
"\n",
"fill_gap_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('fill | Yerli: ___ на полигональном уровне, но некоторые формы пиксельных шейдеров, включая текстурирование останутся.',\n",
" 'В общем-то все детали поверхностей рассчитываются')"
]
},
"metadata": {
"tags": []
},
"execution_count": 33
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EyyhL_HkTDSw"
},
"source": [
"#### Simplification"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7Ug_5GZ-nLDi"
},
"source": [
"The WikiLarge corpus, translated to Russian (from https://github.com/dialogue-evaluation/RuSimpleSentEval), filtered by length and n-gram similarity."
]
},
{
"cell_type": "code",
"metadata": {
"id": "qJ0F_yRpTW-u"
},
"source": [
"import pandas as pd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AIwM5np9TGCf"
},
"source": [
"simple_filtered = pd.read_csv('/gd/MyDrive/datasets/wiki_simple_ru_filtered.tsv', sep='\\t')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7t19S656TP4n",
"outputId": "e3b23ce6-e816-4513-bb7e-d0207776eaf6"
},
"source": [
"def simplify_task(en=0.3):\n",
" row = simple_filtered.sample(1).iloc[0]\n",
" if en is True or isinstance(en, float) and random.random() < en:\n",
" x, y = row.src, row.dst\n",
" else:\n",
" x, y = row.target_x, row.target_y\n",
"\n",
" return f'simplify | {x}', y\n",
"\n",
"simplify_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('simplify | Уидон, Светлячок: полная серия: комментарий к \"Train Job\", дорожка 10 Он имеет тенденцию вести себя как \"ламокс\", который думает, что он самый умный человек в космосе, но время от времени сквозь этот фасад просматриваются намеки на разум, создается впечатление, что он действует глупее, чем он есть.',\n",
" 'Уидон, Светлячок: вся серия: комментарий к «Train Job», трек 10. Он думает, что он самый умный парень в космосе, но он прямо противоположный.')"
]
},
"metadata": {
"tags": []
},
"execution_count": 36
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "izqxBwHctCBd"
},
"source": [
"### Summarization"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ulFu0n4ptH3R"
},
"source": [
"https://github.com/IlyaGusev/gazeta"
]
},
{
"cell_type": "code",
"metadata": {
"id": "gtSgdeDDtEgl"
},
"source": [
"# ! wget https://www.dropbox.com/s/43l702z5a5i2w8j/gazeta_train.jsonl"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "wFVbbwXMtRcG"
},
"source": [
"#import json\n",
"#gazeta = []\n",
"#with open('gazeta_train.jsonl', 'r') as f:\n",
"# for line in tqdm(f.readlines()):\n",
"# gazeta.append(json.loads(line))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WIqP2XTRtsBg"
},
"source": [
"#random.choice(gazeta)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "s4rq3wJtt1eD"
},
"source": [
"# pd.Series([len(random.choice(gazeta)['text']) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "L1En72GRuIsu"
},
"source": [
"# pd.Series([len(tokenizer.tokenize(random.choice(gazeta)['text'])) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zUcHomuIukoM"
},
"source": [
"#gazeta_short = [g for g in gazeta if len(g['text']) <= 5000]\n",
"#print(len(gazeta)) # 52400\n",
"#print(len(gazeta_short)) # 37980"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "t0ufRPz1vqu_"
},
"source": [
"#pd.Series([len(tokenizer.tokenize(random.choice(gazeta_short)['text'])) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Ji5J94V0u7wE"
},
"source": [
"#def sumarize_task():\n",
"# row = random.choice(gazeta_short)\n",
"# return f'summarize | {row[\"text\"]}', row[\"summary\"]\n",
"#\n",
"#sumarize_task()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "aFwQ3a4m_mkt"
},
"source": [
"Training data for https://huggingface.co/IlyaGusev/rubert_telegram_headlines"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jG4rby0L-hrP",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "c1f1bcbe-a3be-4a55-dcf1-83221a113932"
},
"source": [
"!wget https://www.dropbox.com/s/ykqk49a8avlmnaf/ru_all_split.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-10 21:01:22-- https://www.dropbox.com/s/ykqk49a8avlmnaf/ru_all_split.tar.gz\n",
"Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6021:18::a27d:4112\n",
"Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.\n",
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
"Location: /s/raw/ykqk49a8avlmnaf/ru_all_split.tar.gz [following]\n",
"--2021-06-10 21:01:22-- https://www.dropbox.com/s/raw/ykqk49a8avlmnaf/ru_all_split.tar.gz\n",
"Reusing existing connection to www.dropbox.com:443.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com/cd/0/inline/BQKrAXoOeirTxLMhZUDFXB_b1sX76cFkcHWwHVC9WYfLsCuD-f1CVFlKQvaCHhEhJvdX2NYPNQfqXZf_6pu2cyx9XW-qgQ-53lpcoyLrXxZ1rNWHSoNCMtP5y5JV2CETR9TYnUPC-TgQBsekqUh_OUY9/file# [following]\n",
"--2021-06-10 21:01:22-- https://uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com/cd/0/inline/BQKrAXoOeirTxLMhZUDFXB_b1sX76cFkcHWwHVC9WYfLsCuD-f1CVFlKQvaCHhEhJvdX2NYPNQfqXZf_6pu2cyx9XW-qgQ-53lpcoyLrXxZ1rNWHSoNCMtP5y5JV2CETR9TYnUPC-TgQBsekqUh_OUY9/file\n",
"Resolving uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com (uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com)... 162.125.65.15, 2620:100:6027:15::a27d:480f\n",
"Connecting to uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com (uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com)|162.125.65.15|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: /cd/0/inline2/BQJ_5gIfyenc_rdXq8RZozsJi-nVy1ZZa2_jj8UbF1EqU64Esuk-2u4LBoVYj0ZMnM4tceYXA_muf8G8L4XjBPKyKw7zFbLGuozsPUPKEB_2UF8w7f1sLDPz9PgnZSeCiVbTftRXLRtP_vVMFBRHuBt1Kj0CoJIZ0Yp8KJYz5SqP-CYohorannGafNHyYIxKq7aosbcatGnmFtSyK2aSqUuALgPidLGwZ54OiS4TP5q-e5sb8PkbrhdVpuTPKZ34uETjfwTNTc7bIx6zorMOjZTiSX9WIK7Dezmgq_1SCTf6i3KW1b8BH_20FXwvQOQSGMWgJ_i9xwvHbm4JetCiienPr6wrO98U6W7FEHFsQWu6XSCxTb3LxgfW7Nk25NHc1UQ/file [following]\n",
"--2021-06-10 21:01:23-- https://uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com/cd/0/inline2/BQJ_5gIfyenc_rdXq8RZozsJi-nVy1ZZa2_jj8UbF1EqU64Esuk-2u4LBoVYj0ZMnM4tceYXA_muf8G8L4XjBPKyKw7zFbLGuozsPUPKEB_2UF8w7f1sLDPz9PgnZSeCiVbTftRXLRtP_vVMFBRHuBt1Kj0CoJIZ0Yp8KJYz5SqP-CYohorannGafNHyYIxKq7aosbcatGnmFtSyK2aSqUuALgPidLGwZ54OiS4TP5q-e5sb8PkbrhdVpuTPKZ34uETjfwTNTc7bIx6zorMOjZTiSX9WIK7Dezmgq_1SCTf6i3KW1b8BH_20FXwvQOQSGMWgJ_i9xwvHbm4JetCiienPr6wrO98U6W7FEHFsQWu6XSCxTb3LxgfW7Nk25NHc1UQ/file\n",
"Reusing existing connection to uc314301305ae7964b78b0a38c5d.dl.dropboxusercontent.com:443.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 575928675 (549M) [application/octet-stream]\n",
"Saving to: ‘ru_all_split.tar.gz.1’\n",
"\n",
"ru_all_split.tar.gz 100%[===================>] 549.25M 20.7MB/s in 31s \n",
"\n",
"2021-06-10 21:01:55 (17.8 MB/s) - ‘ru_all_split.tar.gz.1’ saved [575928675/575928675]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Yg0XVfWh-jKJ",
"outputId": "fa02d4fd-a80f-4786-dd50-308430f05350"
},
"source": [
"!tar -xvzf ru_all_split.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.dev'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.ino'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.nlink'\n",
"ru_all_train.jsonl\n",
"tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.dev'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.ino'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.nlink'\n",
"ru_all_val.jsonl\n",
"tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.dev'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.ino'\n",
"tar: Ignoring unknown extended header keyword 'SCHILY.nlink'\n",
"ru_all_test.jsonl\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67,
"referenced_widgets": [
"9eb9f30e7786438ebcf30d92b01f8ab9",
"73dcb65dabe84f84a76e778b17fa731e",
"b62cc7f2e3aa42b6b1a0346f2c1d9f88",
"599be3cd8b6347c88a709dca14b78540",
"ab53ae4035db4983a91ff99af7cbe0ca",
"e84f51d728514152965543276403c9bf",
"ee5f1b7f0bea49a8bb09d64c348ee53e",
"b738256f890f48da8af4c099212aafd1"
]
},
"id": "bhUuS0UJ_ujm",
"outputId": "7bf3ef1b-a74b-4d09-e4b7-0936cfefd8c2"
},
"source": [
"import json\n",
"tg = []\n",
"with open('ru_all_train.jsonl', 'r') as f:\n",
" for line in tqdm(f.readlines()):\n",
" tg.append(json.loads(line))"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9eb9f30e7786438ebcf30d92b01f8ab9",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=616216.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_Ii3GyH9_yIP",
"outputId": "43a79cf7-4fb2-4f45-a1fb-c183fe59b932"
},
"source": [
"random.choice(tg)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'text': 'Около 400 очагов новой коронавирусной инфекции возникло в больницах РФ, сообщили в Министерстве здравоохранения России . Как рассказал на заседании Госдумы глава ведомства Михаил Мурашко , россияне небезосновательно опасаются посещать медучреждения. \"Пациенты опасаются — в общем, обоснованно опасаются — в этот период идти на большое вмешательство, я уже говорил о том, что 400 очагов возникло в больницах, заносы, несмотря на все профилактические меры, существуют\", — сказал Мурашко (цитата по ТАСС ). Министр также напомнил, что из-за пандемии коронавируса объёмы оказания плановой медицинской помощи снизились во всём мире. Ранее \"ДП\" писал о том, что в Петербурге, по последним данным, коронавирусом заразились почти 1,5 тыс. медработников. При этом общее число заболевших в городе 13 мая приблизилось к отметке в 8,5 тыс. человек. Выделите фрагмент с текстом ошибки и нажмите Ctrl+Enter',\n",
" 'timestamp': 1589380140,\n",
" 'title': 'Минздрав сообщил о 400 очагах коронавируса в больницах России',\n",
" 'url': 'https://www.dp.ru/a/2020/05/13/Minzdrav_soobshhil_o_400_o'}"
]
},
"metadata": {
"tags": []
},
"execution_count": 48
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LvMfqTNRASc9",
"outputId": "ee60cb91-c4aa-41dd-9623-aafe3e2a9112"
},
"source": [
"pd.Series([len(random.choice(tg)['text']) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.25 731.75\n",
"0.50 1053.00\n",
"0.75 1582.00\n",
"0.90 2543.20\n",
"0.95 3631.20\n",
"0.99 7760.96\n",
"1.00 13118.00\n",
"dtype: float64"
]
},
"metadata": {
"tags": []
},
"execution_count": 49
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Qb4dgUHD_5QS",
"outputId": "06f5fc2c-a015-43ca-84fc-3e026412ee06"
},
"source": [
"pd.Series([len(tokenizer.tokenize(random.choice(tg)['text'])) for _ in range(1000)]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.25 208.00\n",
"0.50 304.50\n",
"0.75 474.00\n",
"0.90 731.50\n",
"0.95 959.15\n",
"0.99 2306.97\n",
"1.00 9433.00\n",
"dtype: float64"
]
},
"metadata": {
"tags": []
},
"execution_count": 50
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vllGR2b9AEMe",
"outputId": "baeee238-5b5e-40ca-8878-e7d17e651f02"
},
"source": [
"def headline_task():\n",
" row = random.choice(tg)\n",
" if len(row[\"text\"]) > 3000:\n",
" return headline_task()\n",
" return f'headline | {row[\"text\"]}', row[\"title\"]\n",
"\n",
"headline_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('headline | В квартальном отчете \"Газпрома\" отметили, что НАК \"Нафтогаз Украины\" подала ходатайство в суд Латвии о приведении в исполнение решения Стокгольмского арбитража по делу против российского монополиста, передает \" Укринформ \". \"5 ноября 2019 года ПАО \"Газпром\" стало известно, что НАК \"Нафтогаз Украины\" подала ходатайство в суд Видземского пригорода г. Рига (Латвия) о признании и приведении в исполнение на территории Латвии решения Стокгольмского арбитража по транзитному спору от 28 февраля 2018 года, а также о принятии обеспечительных мер\", – отметили в сообщении. В \"Газпроме\" подчеркнули, что заседание суда по этому ходатайству запланировано на конец апреля 2020 года. \"Газпром\" изучает возможности по защите своих интересов',\n",
" '\"Нафтогаз\" подал ходатайство об аресте активов \"Газпрома\" в Латвии – подробности')"
]
},
"metadata": {
"tags": []
},
"execution_count": 51
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-r1zJGVCIdm_"
},
"source": [
"## Conversations and answers"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "QQlko9pHm9Sz"
},
"source": [
"Dialogues from fiction, collected in https://github.com/Koziev/NLP_Datasets"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "H2e5xutgIgZt",
"outputId": "f65e48ec-0fc1-4f92-c2a2-f52afadcf83f"
},
"source": [
"!wget https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Conversations/Data/ru.conversations.txt"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-10 21:02:27-- https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Conversations/Data/ru.conversations.txt\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 9718314 (9.3M) [text/plain]\n",
"Saving to: ‘ru.conversations.txt’\n",
"\n",
"ru.conversations.tx 100%[===================>] 9.27M 49.3MB/s in 0.2s \n",
"\n",
"2021-06-10 21:02:28 (49.3 MB/s) - ‘ru.conversations.txt’ saved [9718314/9718314]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "l2g7p0huIidp",
"outputId": "7ee9890d-ea67-45f9-9a5e-65c58be70986"
},
"source": [
"with open('ru.conversations.txt', 'r') as f:\n",
" blobs = f.read().split('\\n\\n')\n",
"print(len(blobs))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"84921\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7qIeIJPDIjuq",
"outputId": "daa68287-61bc-488c-d289-eabadd3cc485"
},
"source": [
"def reply_task():\n",
" b = random.choice(blobs)\n",
" phrases = b[2:].split('\\n- ')\n",
" if len(phrases) < 2:\n",
" return reply_task()\n",
" split_point = random.randint(1, len(phrases)-1)\n",
" prefix = '\\n\\n'.join(phrases[:split_point])\n",
" return f'reply | {prefix}', phrases[split_point]\n",
"\n",
"reply_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('reply | Давно бы так!', 'Товарищ, вы будете сейчас выходить?')"
]
},
"metadata": {
"tags": []
},
"execution_count": 54
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Yp6-ej0cm5XK"
},
"source": [
"Samples from otvet.mail.ru"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Yw2Wd-J9S_Nl"
},
"source": [
"import pandas as pd\n",
"mailru_df = pd.DataFrame(pd.read_pickle('/gd/MyDrive/datasets/nlp/mailru.random100k_from_first_1500k.pkl'))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GDXWEDE8TBkg",
"outputId": "d3cb5ec9-b189-44f3-997b-82be0bc2144d"
},
"source": [
"def answer_task():\n",
" row = mailru_df.sample(1).iloc[0]\n",
" return f'answer | {row.q}', row.a\n",
"\n",
"answer_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('answer | Она ангел', 'Все мы тут ангелы :)')"
]
},
"metadata": {
"tags": []
},
"execution_count": 56
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BSkZNnqgm3UH"
},
"source": [
"Quizzes"
]
},
{
"cell_type": "code",
"metadata": {
"id": "B-Dy8gFQm4tL"
},
"source": [
"import pandas as pd\n",
"quiz_df = pd.DataFrame(pd.read_csv('/gd/MyDrive/datasets/nlp/quiz.tsv', sep='\\t'))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "52cqa6CfnGQK",
"outputId": "febb2804-8267-4da8-e8e3-841aad095497"
},
"source": [
"def quiz_task():\n",
" row = quiz_df.sample(1).iloc[0]\n",
" return f'quiz | {row.q}', row.a\n",
"\n",
"quiz_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('quiz | Задорнизмы. “Знаете ли вы, что чиновники очень любят смотреть на морской прибой с берега: [...три слова пропущено...]!”',\n",
" 'откат за откатом')"
]
},
"metadata": {
"tags": []
},
"execution_count": 58
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_hLeIielIPnv"
},
"source": [
"### A special Russian text normalization task"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eq0Bj6g2ISJz"
},
"source": [
"from natasha import (\n",
" Segmenter,\n",
" MorphVocab,\n",
" NewsEmbedding,\n",
" NewsMorphTagger,\n",
" NewsSyntaxParser,\n",
" NewsNERTagger,\n",
" PER,\n",
" NamesExtractor,\n",
" Doc\n",
")\n",
"segmenter = Segmenter()\n",
"morph_vocab = MorphVocab()\n",
"\n",
"emb = NewsEmbedding()\n",
"morph_tagger = NewsMorphTagger(emb)\n",
"syntax_parser = NewsSyntaxParser(emb)\n",
"\n",
"\n",
"def text2doc(text):\n",
" doc = Doc(text)\n",
" doc.segment(segmenter)\n",
" doc.tag_morph(morph_tagger)\n",
" for token in doc.tokens:\n",
" token.lemmatize(morph_vocab)\n",
" return doc"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yNA_A5DkIaq6"
},
"source": [
"from pymorphy2 import MorphAnalyzer\n",
"anl = MorphAnalyzer()\n",
"\n",
"from natasha.morph.vocab import OC_UD_INDEX, OC_UD_FEATS, OC_UD_POS\n",
"from natasha.norm import normal_pos\n",
"fvalue_pm_dict = {}\n",
"nat2pm = {}\n",
"\n",
"for fname, fvalue_pm, fvalue_nat in OC_UD_FEATS:\n",
" fvalue_pm_dict[fvalue_pm.lower()] = (fname, fvalue_nat)\n",
" nat2pm[(fname, fvalue_nat)] = fvalue_pm.lower()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"9bb8c05692754303b3d8d5cb10afc78e",
"c92e13a65d4c4186ae14c095a75277d3",
"151271018a26466cbe38c279f04f3e6c",
"f1d516d33df44344b759e444ceeb62ca",
"fb803c4b595a4a85ae409dfa00565d61",
"3413c51a743544a8b6bfa7d7893c3aa7",
"6dd6237ec039423ebdde6ea47ad48a12",
"4ee2fd4a7fcb40cabb13cb81cbd364a6"
]
},
"id": "Nn-gWIKzIgF2",
"outputId": "21ad8602-339d-4d16-86cc-4ebfdf5beb43"
},
"source": [
"from collections import Counter, defaultdict\n",
"\n",
"feature_counter = defaultdict(Counter)\n",
"pos_counter = Counter()\n",
"\n",
"IMPORTANT_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'DET', 'PRON', 'PART', 'ADV', 'NUM', 'X', 'INTJ'}\n",
"HELPER_POS = {'ADP', 'PROPN', 'PUNCT', 'SCONJ', 'CCONJ', 'AUX', 'SYM'}\n",
"\n",
"for i, item in enumerate(tqdm(tapaco['train'])):\n",
" if i > 10000:\n",
" break\n",
" doc = text2doc(item['paraphrase'])\n",
" for token in doc.tokens:\n",
" for k, v in token.feats.items():\n",
" feature_counter[k][v] += 1\n",
" pos_counter[token.pos] += 1\n",
"\n",
" if token.pos not in IMPORTANT_POS.union(HELPER_POS):\n",
" print(token.text)\n",
" print(token.pos)\n",
" print()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bb8c05692754303b3d8d5cb10afc78e",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=251263.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "3I2MJciAI6uE"
},
"source": [
"def find_best_parse(token, parses):\n",
" feats = {(k, v) for k, v in token.feats.items()}\n",
" #print(feats)\n",
" scores = []\n",
" for p in parses:\n",
" cand_feats = set()\n",
" score = 1\n",
" for g in p.tag.grammemes:\n",
" if g in OC_UD_POS:\n",
" if str(g) == normal_pos(token.pos) or OC_UD_POS[g] == normal_pos(token.pos):\n",
" score -= 1\n",
" elif g in fvalue_pm_dict:\n",
" cand_feats.add(fvalue_pm_dict[g])\n",
" elif g in {'Sgtm', 'Geox', 'Name', 'intr', 'tran', 'intg', 'UNKN'}:\n",
" continue\n",
" score = len(feats.difference(cand_feats)) + len(cand_feats.difference(feats)) - p.score\n",
" scores.append(score)\n",
" #print(cand_feats, score)\n",
" return parses[np.argmin(scores)]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_rB5iQltI-Jo"
},
"source": [
"import random\n",
"\n",
"def reinflect(token, p_lemma=0.5) -> str:\n",
" \"\"\" Try to reinflect a token into something random \"\"\"\n",
" if token.pos == 'PUNCT':\n",
" return token.text\n",
" if token.lemma and random.random() < p_lemma:\n",
" return token.lemma\n",
" parses = anl.parse(token.text)\n",
" bp = find_best_parse(token, parses)\n",
" if not bp:\n",
" return token.lemma or token.text\n",
" new_feats = []\n",
" for k, v in token.feats.items():\n",
" if k in {'Animacy', 'Aspect'}:\n",
" continue\n",
" if bp.tag.POS in {'NOUN'} and k in {'Gender'}:\n",
" continue\n",
" if 'Sgtm' in bp.tag and k == 'Number':\n",
" continue\n",
"\n",
" keys = list(feature_counter[k].keys())\n",
" if not keys:\n",
" continue\n",
" new_v_nat = random.choice(keys)\n",
" if (k, new_v_nat) not in nat2pm:\n",
" continue\n",
" new_v_pm = nat2pm[(k, new_v_nat)]\n",
"\n",
" if bp.tag.aspect and bp.tag.aspect == 'perf' and new_v_pm == 'pres':\n",
" continue\n",
"\n",
" if (k, new_v_nat) in nat2pm:\n",
" new_feats.append(new_v_pm)\n",
" random.shuffle(new_feats)\n",
" infl = None\n",
" newp = bp\n",
" if new_feats:\n",
" for f in new_feats:\n",
" try:\n",
" infl = newp.inflect({f})\n",
" except ValueError:\n",
" continue\n",
" if infl:\n",
" newp = infl\n",
" if newp.word != bp.word:\n",
" return newp.word\n",
" return bp.word"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"id": "HtfSaYUgJAC2",
"outputId": "738d6e48-23e6-4b95-be11-a7955acc7d0a"
},
"source": [
"def spoil_text(doc, reinflection=0.5, important_drop=0.2, helper_drop=0.5, position_noise=2.0):\n",
" results = []\n",
" for token in doc.tokens:\n",
" if token.pos in IMPORTANT_POS:\n",
" if random.random() < important_drop:\n",
" continue\n",
" else:\n",
" if random.random() < helper_drop:\n",
" continue\n",
" word = token.text\n",
" if random.random() < reinflection:\n",
" word = reinflect(token)\n",
" results.append(word)\n",
" if position_noise:\n",
" orders = [random.normalvariate(mu=i, sigma=position_noise) for i, _ in enumerate(results)]\n",
" results = [x for _, x in sorted(zip(orders, results))]\n",
" if not results:\n",
" return random.choice(doc.tokens).text.lower()\n",
" return ' '.join(results).lower()\n",
"\n",
"spoil_text(text2doc('Шла Саша по шоссе и сосала сушку.'), reinflection=0, important_drop=0, helper_drop=0)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'по шла саша и сосала сушку шоссе .'"
]
},
"metadata": {
"tags": []
},
"execution_count": 64
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2-8hRsbNJIje",
"outputId": "996a9080-5a5c-4a9d-864b-2e1879f45dfa"
},
"source": [
"import numpy as np\n",
"\n",
"def assemble_task():\n",
" # text = random.choice(tapaco['train'])['paraphrase']\n",
" text = random.choice(df_leipzig.text)\n",
" spoiled = spoil_text(\n",
" text2doc(text),\n",
" reinflection=random.random() * 1.0,\n",
" important_drop=random.random() * 0.5,\n",
" helper_drop=random.random() * 0.8,\n",
" position_noise=np.exp(random.normalvariate(0, 1)),\n",
" )\n",
" return f'assemble | {spoiled}', text\n",
"\n",
"assemble_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('assemble | раунде лучший можно любом по выбирается участвовать итогам результат',\n",
" 'Участвовать можно в любом раунде, по итогам выбирается лучший результат.')"
]
},
"metadata": {
"tags": []
},
"execution_count": 65
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bpmmlijRpOwi"
},
"source": [
"### Question answering"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Khr-pXY2pZds"
},
"source": [
"Take the SberQuAD dataset used by the DeepPavlov SQuAD models (see http://docs.deeppavlov.ai/en/master/features/models/squad.html)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "PUWFD9DTpQ1E",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "40b8793c-c9ae-4ff0-bef9-3ebab0c803a1"
},
"source": [
"!wget http://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-10 21:03:06-- http://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz\n",
"Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 93.175.29.74\n",
"Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:80... connected.\n",
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
"Location: https://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz [following]\n",
"--2021-06-10 21:03:06-- https://files.deeppavlov.ai/datasets/sber_squad-v1.1.tar.gz\n",
"Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 22825727 (22M) [application/octet-stream]\n",
"Saving to: ‘sber_squad-v1.1.tar.gz’\n",
"\n",
"sber_squad-v1.1.tar 100%[===================>] 21.77M 13.9MB/s in 1.6s \n",
"\n",
"2021-06-10 21:03:08 (13.9 MB/s) - ‘sber_squad-v1.1.tar.gz’ saved [22825727/22825727]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0xlbp70wpR-d",
"outputId": "ae5c710d-c9b8-44a9-dfed-69a8e87fccd3"
},
"source": [
"!tar -xvzf sber_squad-v1.1.tar.gz"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"dev-v1.1.json\n",
"train-v1.1.json\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iEUE69U9pbJ5"
},
"source": [
"import json\n",
"with open('train-v1.1.json', 'r') as f:\n",
" sbsq = json.load(f)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VDtWK3hmpl5M",
"outputId": "55ee7c6d-5099-4ad8-ab6b-3bf433aa3e41"
},
"source": [
"len(sbsq['data'][0]['paragraphs'])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45328"
]
},
"metadata": {
"tags": []
},
"execution_count": 69
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uWM3isaSphuk",
"outputId": "fff78196-d502-4d14-b3c5-1aa445842e83"
},
"source": [
"pp = random.choice(sbsq['data'][0]['paragraphs'])\n",
"pp"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'context': 'Новым главным тренером сборной Германии стал Эрих Риббек, ставший самым возрастным тренером сборной Германии за всю её историю (на момент начала работы ему исполнился 61 год). Кроме этого, Риббек запомнился ещё тем, что проработал со сборной меньше всех других главных тренеров (с 1998 по 2000 годы); исключил из сборной ветерана Лотара Маттеуса по причине его преклонного возраста, хотя и взял его после долгих уговоров на чемпионат Европы 2000 года и, самое главное — сборная под его руководством выступила хуже всего в своей истории, проиграв и Кубок конфедераций 1999 года, и Евро 2000. В обоих случаях команда даже не преодолела групповой этап.',\n",
" 'id': '3065',\n",
" 'qas': [{'answers': [{'answer_start': 167, 'text': '61 год'}],\n",
" 'id': '31533',\n",
" 'question': 'Сколько лет было Эриху Риббеку в начале работы главным тренером?'}]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 70
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b6Q16iQgp1lc",
"outputId": "c290f6bf-b4ba-4400-a282-d4c9f750221e"
},
"source": [
"def ask_task():\n",
" pp = random.choice(sbsq['data'][0]['paragraphs'])\n",
" qq = random.choice(pp['qas'])\n",
" return f'ask | {pp[\"context\"]}', qq[\"question\"]\n",
"\n",
"ask_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('ask | Наиболее спорной является койсанская гипотеза, согласно которой в одну макросемью объединяются все не-банту языки юга Африки, проживают в государствах: Намибия (62,1 %), Ботсвана (19,6 %), Танзания (13,4 %), Ангола (2,6 %), ЮАР (1 %), Зимбабве. Их общим признаком является наличие особых щёлкающих согласных. По этому же признаку к койсанским языкам добавляются два изолированных языка с востока Африки: сандаве и хадза. Койсанские языки изучены очень слабо, причем около половины из примерно 30 языков уже вымерло, а большинство остальных находится на грани вымирания. Все это значительно затрудняет их исследование. В середине 1980-х годов на африканском континенте насчитывалось 306 тыс. человек народов, принадлежащих к этой языковой макросемье, что составляло 0,06 % от всего населения Африки. Крупнейшими народами этой макросемьи являются готтентоты — 110 тыс. чел. (36 %), горные дамара — 80 (26 %), бушмены — 75 (24,5 %) и сандаве — 40 (13 %). Ранее по этнографическому принципу эти языки делились на бушменские и готтентотские. Ныне известные койсанские языки делятся на 2 семьи, родство между которыми вполне вероятно, и 3 изолированных языка, которые могут быть и не родственны остальным:',\n",
" 'Какой общий признак у языков юга Африки?')"
]
},
"metadata": {
"tags": []
},
"execution_count": 71
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_utTc5AWqXmM",
"outputId": "6d037550-7403-47ba-d4e4-e4918b92d5d8"
},
"source": [
"def comprehend_task():\n",
" pp = random.choice(sbsq['data'][0]['paragraphs'])\n",
" qq = random.choice(pp['qas'])\n",
" aa = random.choice(qq['answers'])\n",
" return f'comprehend | {pp[\"context\"]}.\\nВопрос: {qq[\"question\"]}', aa[\"text\"]\n",
"\n",
"comprehend_task()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('comprehend | В СССР теоретические и экспериментальные исследования особенностей пуска, работы и контроля реакторов были проведены группой физиков и инженеров под руководством академика И. В. Курчатова. Первый советский реактор Ф-1 был построен в Лаборатории № 2 АН СССР (Москва). Этот реактор выведен в критическое состояние 25 декабря 1946 года. Реактор Ф-1 был набран из графитовых блоков и имел форму шара диаметром примерно 7,5 м. В центральной части шара диаметром 6 м по отверстиям в графитовых блоках размещены урановые стержни. Реактор Ф-1, как и реактор CP-1, не имел системы охлаждения, поэтому работал на очень малых уровнях мощности. Результаты исследований на реакторе Ф-1 стали основой проектов более сложных по конструкции промышленных реакторов. В 1948 году введён в действие реактор И-1 (по другим данным он назывался А-1) по производству плутония, а 27 июня 1954 года вступила в строй первая в мире атомная электростанция электрической мощностью 5 МВт в г. Обнинске..\\nВопрос: Под чьим руководством были проведены теоретические и экспериментальные исследования особенностей пуска, работы и контроля реакторов?',\n",
" 'под руководством академика И. В. Курчатова')"
]
},
"metadata": {
"tags": []
},
"execution_count": 72
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GTl0VzM8uaUG",
"outputId": "5c4ff1b4-35c6-45b1-bbdb-8c8bbe2b6326"
},
"source": [
"# token-count quantiles of 1000 randomly drawn paragraph contexts\n",
"pd.Series([\n",
"    len(tokenizer.tokenize(random.choice(sbsq['data'][0]['paragraphs'])['context']))\n",
"    for _ in range(1000)\n",
"]).quantile([0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.50 207.00\n",
"0.75 261.00\n",
"0.90 326.00\n",
"0.95 361.00\n",
"0.99 558.25\n",
"1.00 1366.00\n",
"dtype: float64"
]
},
"metadata": {
"tags": []
},
"execution_count": 73
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HxBiq2mH-J3-",
"outputId": "d7ecc65c-48da-4f31-b795-1ca7f4a6b090"
},
"source": [
"# token-count quantiles of 3000 sampled `comprehend_task` prompts\n",
"pd.Series([\n",
"    len(tokenizer.tokenize(comprehend_task()[0]))\n",
"    for _ in range(3000)\n",
"]).quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.25 198.00\n",
"0.50 232.00\n",
"0.75 288.00\n",
"0.90 355.00\n",
"0.95 390.00\n",
"0.99 500.02\n",
"1.00 1039.00\n",
"dtype: float64"
]
},
"metadata": {
"tags": []
},
"execution_count": 74
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UqFCgEDKhwsH"
},
"source": [
"### Russian SuperGLUE"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Q8jgM9PKhzq9",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "27a0694b-f970-4fbf-8ffe-e07e61fa789f"
},
"source": [
"# Download and unpack every Russian SuperGLUE task archive, then delete the archives.\n",
"# unzip -o overwrites already-extracted files, so re-running the cell does not stop at a prompt.\n",
"!for TASK in LiDiRus RCB PARus MuSeRC TERRa RUSSE RWSD DaNetQA RuCoS; do wget https://russiansuperglue.com/tasks/download/$TASK --content-disposition && unzip -o $TASK.zip; done\n",
"!rm -rf ./*.zip"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-10 21:03:11-- https://russiansuperglue.com/tasks/download/LiDiRus\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 47118 (46K) [application/zip]\n",
"Saving to: ‘LiDiRus.zip’\n",
"\n",
"\rLiDiRus.zip 0%[ ] 0 --.-KB/s \rLiDiRus.zip 100%[===================>] 46.01K --.-KB/s in 0.04s \n",
"\n",
"2021-06-10 21:03:12 (1.12 MB/s) - ‘LiDiRus.zip’ saved [47118/47118]\n",
"\n",
"Archive: LiDiRus.zip\n",
" creating: LiDiRus/\n",
" inflating: LiDiRus/.DS_Store \n",
" creating: __MACOSX/\n",
" creating: __MACOSX/LiDiRus/\n",
" inflating: __MACOSX/LiDiRus/._.DS_Store \n",
" inflating: LiDiRus/LiDiRus.jsonl \n",
" inflating: __MACOSX/LiDiRus/._LiDiRus.jsonl \n",
" inflating: __MACOSX/._LiDiRus \n",
"--2021-06-10 21:03:12-- https://russiansuperglue.com/tasks/download/RCB\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 136700 (133K) [application/zip]\n",
"Saving to: ‘RCB.zip’\n",
"\n",
"RCB.zip 100%[===================>] 133.50K --.-KB/s in 0.08s \n",
"\n",
"2021-06-10 21:03:12 (1.59 MB/s) - ‘RCB.zip’ saved [136700/136700]\n",
"\n",
"Archive: RCB.zip\n",
" creating: RCB/\n",
" inflating: RCB/train.jsonl \n",
" creating: __MACOSX/RCB/\n",
" inflating: __MACOSX/RCB/._train.jsonl \n",
" inflating: RCB/.DS_Store \n",
" inflating: __MACOSX/RCB/._.DS_Store \n",
" inflating: RCB/test.jsonl \n",
" inflating: __MACOSX/RCB/._test.jsonl \n",
" inflating: RCB/val.jsonl \n",
" inflating: __MACOSX/RCB/._val.jsonl \n",
" inflating: __MACOSX/._RCB \n",
"--2021-06-10 21:03:12-- https://russiansuperglue.com/tasks/download/PARus\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 57585 (56K) [application/zip]\n",
"Saving to: ‘PARus.zip’\n",
"\n",
"PARus.zip 100%[===================>] 56.24K --.-KB/s in 0.04s \n",
"\n",
"2021-06-10 21:03:12 (1.36 MB/s) - ‘PARus.zip’ saved [57585/57585]\n",
"\n",
"Archive: PARus.zip\n",
" creating: PARus/\n",
" inflating: PARus/train.jsonl \n",
" creating: __MACOSX/PARus/\n",
" inflating: __MACOSX/PARus/._train.jsonl \n",
" inflating: PARus/.DS_Store \n",
" inflating: __MACOSX/PARus/._.DS_Store \n",
" inflating: PARus/test.jsonl \n",
" inflating: __MACOSX/PARus/._test.jsonl \n",
" inflating: PARus/val.jsonl \n",
" inflating: __MACOSX/PARus/._val.jsonl \n",
" inflating: __MACOSX/._PARus \n",
"--2021-06-10 21:03:12-- https://russiansuperglue.com/tasks/download/MuSeRC\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1196720 (1.1M) [application/zip]\n",
"Saving to: ‘MuSeRC.zip’\n",
"\n",
"MuSeRC.zip 100%[===================>] 1.14M 5.52MB/s in 0.2s \n",
"\n",
"2021-06-10 21:03:13 (5.52 MB/s) - ‘MuSeRC.zip’ saved [1196720/1196720]\n",
"\n",
"Archive: MuSeRC.zip\n",
" creating: MuSeRC/\n",
" inflating: MuSeRC/train.jsonl \n",
" creating: __MACOSX/MuSeRC/\n",
" inflating: __MACOSX/MuSeRC/._train.jsonl \n",
" inflating: MuSeRC/.DS_Store \n",
" inflating: __MACOSX/MuSeRC/._.DS_Store \n",
" inflating: MuSeRC/test.jsonl \n",
" inflating: __MACOSX/MuSeRC/._test.jsonl \n",
" inflating: MuSeRC/val.jsonl \n",
" inflating: __MACOSX/MuSeRC/._val.jsonl \n",
" inflating: __MACOSX/._MuSeRC \n",
"--2021-06-10 21:03:13-- https://russiansuperglue.com/tasks/download/TERRa\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 907346 (886K) [application/zip]\n",
"Saving to: ‘TERRa.zip’\n",
"\n",
"TERRa.zip 100%[===================>] 886.08K 4.29MB/s in 0.2s \n",
"\n",
"2021-06-10 21:03:13 (4.29 MB/s) - ‘TERRa.zip’ saved [907346/907346]\n",
"\n",
"Archive: TERRa.zip\n",
" creating: TERRa/\n",
" inflating: TERRa/train.jsonl \n",
" creating: __MACOSX/TERRa/\n",
" inflating: __MACOSX/TERRa/._train.jsonl \n",
" inflating: TERRa/.DS_Store \n",
" inflating: __MACOSX/TERRa/._.DS_Store \n",
" inflating: TERRa/test.jsonl \n",
" inflating: __MACOSX/TERRa/._test.jsonl \n",
" inflating: TERRa/val.jsonl \n",
" inflating: __MACOSX/TERRa/._val.jsonl \n",
" inflating: __MACOSX/._TERRa \n",
"--2021-06-10 21:03:13-- https://russiansuperglue.com/tasks/download/RUSSE\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 3806009 (3.6M) [application/zip]\n",
"Saving to: ‘RUSSE.zip’\n",
"\n",
"RUSSE.zip 100%[===================>] 3.63M 12.8MB/s in 0.3s \n",
"\n",
"2021-06-10 21:03:14 (12.8 MB/s) - ‘RUSSE.zip’ saved [3806009/3806009]\n",
"\n",
"Archive: RUSSE.zip\n",
" creating: RUSSE/\n",
" inflating: RUSSE/train.jsonl \n",
" creating: __MACOSX/RUSSE/\n",
" inflating: __MACOSX/RUSSE/._train.jsonl \n",
" inflating: RUSSE/.DS_Store \n",
" inflating: __MACOSX/RUSSE/._.DS_Store \n",
" inflating: RUSSE/test.jsonl \n",
" inflating: __MACOSX/RUSSE/._test.jsonl \n",
" inflating: RUSSE/val.jsonl \n",
" inflating: __MACOSX/RUSSE/._val.jsonl \n",
" inflating: __MACOSX/._RUSSE \n",
"--2021-06-10 21:03:14-- https://russiansuperglue.com/tasks/download/RWSD\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 40508 (40K) [application/zip]\n",
"Saving to: ‘RWSD.zip’\n",
"\n",
"RWSD.zip 100%[===================>] 39.56K --.-KB/s in 0s \n",
"\n",
"2021-06-10 21:03:14 (326 MB/s) - ‘RWSD.zip’ saved [40508/40508]\n",
"\n",
"Archive: RWSD.zip\n",
" creating: RWSD/\n",
" inflating: RWSD/train.jsonl \n",
" creating: __MACOSX/RWSD/\n",
" inflating: __MACOSX/RWSD/._train.jsonl \n",
" inflating: RWSD/.DS_Store \n",
" inflating: __MACOSX/RWSD/._.DS_Store \n",
" inflating: RWSD/test.jsonl \n",
" inflating: __MACOSX/RWSD/._test.jsonl \n",
" inflating: RWSD/val.jsonl \n",
" inflating: __MACOSX/RWSD/._val.jsonl \n",
" inflating: __MACOSX/._RWSD \n",
"--2021-06-10 21:03:14-- https://russiansuperglue.com/tasks/download/DaNetQA\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1293761 (1.2M) [application/zip]\n",
"Saving to: ‘DaNetQA.zip’\n",
"\n",
"DaNetQA.zip 100%[===================>] 1.23M 5.02MB/s in 0.2s \n",
"\n",
"2021-06-10 21:03:14 (5.02 MB/s) - ‘DaNetQA.zip’ saved [1293761/1293761]\n",
"\n",
"Archive: DaNetQA.zip\n",
" creating: DaNetQA/\n",
" inflating: DaNetQA/train.jsonl \n",
" creating: __MACOSX/DaNetQA/\n",
" inflating: __MACOSX/DaNetQA/._train.jsonl \n",
" inflating: DaNetQA/.DS_Store \n",
" inflating: __MACOSX/DaNetQA/._.DS_Store \n",
" inflating: DaNetQA/test.jsonl \n",
" inflating: __MACOSX/DaNetQA/._test.jsonl \n",
" inflating: DaNetQA/val.jsonl \n",
" inflating: __MACOSX/DaNetQA/._val.jsonl \n",
" inflating: __MACOSX/._DaNetQA \n",
"--2021-06-10 21:03:14-- https://russiansuperglue.com/tasks/download/RuCoS\n",
"Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48\n",
"Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 56208297 (54M) [application/zip]\n",
"Saving to: ‘RuCoS.zip’\n",
"\n",
"RuCoS.zip 100%[===================>] 53.60M 9.57MB/s in 5.2s \n",
"\n",
"2021-06-10 21:03:20 (10.4 MB/s) - ‘RuCoS.zip’ saved [56208297/56208297]\n",
"\n",
"Archive: RuCoS.zip\n",
" creating: RuCoS/\n",
" inflating: RuCoS/train.jsonl \n",
" creating: __MACOSX/RuCoS/\n",
" inflating: __MACOSX/RuCoS/._train.jsonl \n",
" inflating: RuCoS/.DS_Store \n",
" inflating: __MACOSX/RuCoS/._.DS_Store \n",
" inflating: RuCoS/test.jsonl \n",
" inflating: __MACOSX/RuCoS/._test.jsonl \n",
" inflating: RuCoS/val.jsonl \n",
" inflating: __MACOSX/RuCoS/._val.jsonl \n",
" inflating: __MACOSX/._RuCoS \n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "VqNKd0gqiVEt"
},
"source": [
"import codecs\n",
"import json\n",
"import pandas as pd\n",
"from collections import defaultdict\n",
"import copy\n",
"import random\n",
"import numpy as np\n",
"import re\n",
"\n",
"def load_jsonl(path):\n",
"    \"\"\"Read a JSON Lines file into a list of parsed records.\n",
"\n",
"    Args:\n",
"        path: path to a UTF-8 text file (a leading BOM is tolerated) holding one JSON\n",
"            value per line.\n",
"\n",
"    Returns:\n",
"        list with one decoded value per non-blank line, in file order.\n",
"\n",
"    Raises:\n",
"        json.JSONDecodeError: if a non-blank line is not valid JSON.\n",
"    \"\"\"\n",
"    # built-in open() replaces the legacy codecs.open(); 'utf-8-sig' strips a BOM if present\n",
"    with open(path, encoding='utf-8-sig') as reader:\n",
"        # stream line by line and skip blank or whitespace-only lines\n",
"        return [json.loads(line) for line in reader if line.strip()]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_8_SvedsiaOj",
"outputId": "55232599-82a5-4a92-8bcc-5131f33f99c2"
},
"source": [
"TASK_NAMES = ['LiDiRus', 'RCB', 'PARus', 'MuSeRC', 'TERRa', 'RUSSE', 'RWSD', 'DaNetQA', 'RuCoS']\n",
"\n",
"# the first entry (LiDiRus) is skipped; each remaining task is loaded from its train.jsonl\n",
"task_data = {name: load_jsonl(name + '/train.jsonl') for name in TASK_NAMES[1:]}\n",
"\n",
"task_ids = list(task_data)\n",
"# a task's sampling weight is the square root of its number of training records,\n",
"# rescaled so that the weights sum to 1\n",
"task_weights = np.array([len(records) for records in task_data.values()])**0.5\n",
"task_weights = task_weights / task_weights.sum()\n",
"print(task_ids, task_weights, sep='\\n')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"['RCB', 'PARus', 'MuSeRC', 'TERRa', 'RUSSE', 'RWSD', 'DaNetQA', 'RuCoS']\n",
"[0.03544588 0.03387339 0.0378716 0.08662586 0.23859111 0.04169318\n",
" 0.07083104 0.45506794]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fAJFsxGbia-G"
},
"source": [
"# RCB label -> Russian target string\n",
"RCB_labels = dict(\n",
"    contradiction='противоречие',\n",
"    entailment='следствие',\n",
"    neutral='не очевидно',\n",
")\n",
"\n",
"def RCB_sampler(item):\n",
"    \"\"\"Convert one RCB record into a (prompt, target) pair.\n",
"\n",
"    The prompt lists the premise and the hypothesis and ends with 'Логично?';\n",
"    the target is `RCB_labels[item['label']]`.\n",
"    \"\"\"\n",
"    prompt = 'Дано: {}. Гипотеза: {}. Логично?'.format(item['premise'], item['hypothesis'])\n",
"    return prompt, RCB_labels[item['label']]\n",
"\n",
"def PARus_sampler(item):\n",
"    \"\"\"Convert one PARus record into a yes/no (prompt, answer) pair.\n",
"\n",
"    One of the two choices is picked at random and paired with the premise. The two\n",
"    statements are tagged 'Причина' / 'Следствие' according to `item['question']`,\n",
"    shuffled, and followed by 'Логично?'. The answer is 'да' when the index of the\n",
"    picked choice equals `item['label']`, otherwise 'нет'.\n",
"    \"\"\"\n",
"    # for a 'cause' question the premise is tagged as the effect and the choice as the cause;\n",
"    # any other question value is treated as 'effect'\n",
"    asks_for_cause = item['question'] == 'cause'\n",
"    premise_tag, choice_tag = ('Следствие', 'Причина') if asks_for_cause else ('Причина', 'Следствие')\n",
"    picked = random.choice([0, 1])\n",
"    candidate = item['choice{}'.format(picked + 1)]\n",
"    if item['label'] == picked:\n",
"        verdict = 'да'\n",
"    else:\n",
"        verdict = 'нет'\n",
"    statements = ['{}: {}'.format(premise_tag, item['premise']), '{}: {}'.format(choice_tag, candidate)]\n",
"    random.shuffle(statements)\n",
"    return '. '.join(statements) + '. Логично?', verdict\n",
"\n",
"def MuSeRC_sampler(item):\n",
"    \"\"\"Convert one MuSeRC record into a (prompt, answer) pair.\n",
"\n",
"    Markers of the form '(<digits>) ' are stripped from the passage text, a question is\n",
"    picked at random, and the answer is picked at random among that question's answers\n",
"    whose 'label' is truthy.\n",
"\n",
"    Raises:\n",
"        IndexError: if the picked question has no answer with a truthy label.\n",
"    \"\"\"\n",
"    # raw string: the regex backslashes must not be read as (invalid) string escapes\n",
"    text = re.sub(r'\\(\\d+\\) ', '', item['passage']['text'])\n",
"    picked = random.choice(item['passage']['questions'])\n",
"    question = picked['question']\n",
"    correct_answers = [candidate for candidate in picked['answers'] if candidate['label']]\n",
"    answer = random.choice(correct_answers)['text']\n",
"    return f'Вопрос: {question}. Дано: {text}', answer\n",
"\n",
"# TERRa label -> Russian target string\n",
"TERRa_labels = dict(\n",
"    entailment='логично',\n",
"    not_entailment='не очевидно',\n",
")\n",
"\n",
"def TERRa_sampler(item):\n",
"    \"\"\"Convert one TERRa record into a (prompt, target) pair.\n",
"\n",
"    The prompt lists the premise and the hypothesis and ends with 'Логично?';\n",
"    the target is `TERRa_labels[item['label']]`.\n",
"    \"\"\"\n",
"    prompt = 'Дано: {}. Гипотеза: {}. Логично?'.format(item['premise'], item['hypothesis'])\n",
"    return prompt, TERRa_labels[item['label']]\n",
"\n",
"def RUSSE_sampler(item):\n",
"    \"\"\"Convert one RUSSE record into a yes/no (prompt, answer) pair.\n",
"\n",
"    The two sentences are shuffled and shown as 'Текст 1' / 'Текст 2' after a question\n",
"    about `item['word']`; the answer is 'да' for a truthy label, otherwise 'нет'.\n",
"    \"\"\"\n",
"    texts = [item['sentence1'], item['sentence2']]\n",
"    random.shuffle(texts)\n",
"    first, second = texts\n",
"    if item['label']:\n",
"        verdict = 'да'\n",
"    else:\n",
"        verdict = 'нет'\n",
"    prompt = 'Слово \"{}\" употребляется в одинаковом смысле? Текст 1: {}. Текст 2: {}.'.format(\n",
"        item['word'], first, second\n",
"    )\n",
"    return prompt, verdict\n",
"\n",
"def RWSD_sampler(item):\n",
"    \"\"\"Convert one RWSD record into a (prompt, answer) pair.\n",
"\n",
"    For a truthy label, half of the time (random draw below 0.5) the prompt asks what\n",
"    span 2 refers to and the answer is span 1. Otherwise the prompt asks whether span 2\n",
"    refers to span 1 and the answer is 'да' / 'нет' according to the label.\n",
"    \"\"\"\n",
"    target = item['target']\n",
"    span1, span2 = target['span1_text'], target['span2_text']\n",
"    label = item['label']\n",
"    # random.random() is only consulted when the label is truthy (short-circuit)\n",
"    if label and random.random() < 0.5:\n",
"        return '{}. К чему относится \"{}\"?'.format(item['text'], span2), span1\n",
"    prompt = '{}. \"{}\" относится к \"{}\"?'.format(item['text'], span2, span1)\n",
"    verdict = 'да' if label else 'нет'\n",
"    return prompt, verdict\n",
"\n",
"def DaNetQA_sampler(item):\n",
"    \"\"\"Convert one DaNetQA record into a yes/no (prompt, answer) pair.\n",
"\n",
"    The question is placed after the passage or before it, each with probability 0.5.\n",
"    The answer is 'да' for a truthy label, otherwise 'нет'.\n",
"    \"\"\"\n",
"    passage = item['passage']\n",
"    # drop every '?' from the question, then append exactly one\n",
"    question = item['question'].replace('?', '') + '?'\n",
"    verdict = 'да' if item['label'] else 'нет'\n",
"    if random.random() < 0.5:\n",
"        prompt = f'{passage}. Вопрос: {question}'\n",
"    else:\n",
"        prompt = f'Вопрос: {question} {passage}'\n",
"    return prompt, verdict\n",
"\n",
"def RuCoS_sampler(item):\n",
"    \"\"\"Convert one RuCoS record into a (prompt, target) pair.\n",
"\n",
"    Only the passage text before the first '@highlight' is used. A random query and one\n",
"    of its answers are picked; then, with probability 0.5 each, either\n",
"    - the passage plus 'Вкратце:' is the prompt and the query with '@placeholder'\n",
"      replaced by the answer is the target, or\n",
"    - the query with '@placeholder' replaced by 'ЭТО' plus the passage is the prompt and\n",
"      the answer text is the target.\n",
"    \"\"\"\n",
"    body = item['passage']['text'].partition('@highlight')[0].strip()\n",
"    qa = random.choice(item['qas'])\n",
"    query = qa['query']\n",
"    entity = random.choice(qa['answers'])['text']\n",
"    if random.random() >= 0.5:\n",
"        masked = query.replace('@placeholder', 'ЭТО')\n",
"        return '{}. {}. Вопрос: ЭТО - что?'.format(masked, body), entity\n",
"    # summarization\n",
"    return body + '\\nВкратце:', query.replace('@placeholder', entity)\n",
"\n",
"# task name -> function that turns one record of that task into a (prompt, target) pair\n",
"samplers = dict(\n",
"    RCB=RCB_sampler,\n",
"    PARus=PARus_sampler,\n",
"    MuSeRC=MuSeRC_sampler,\n",
"    TERRa=TERRa_sampler,\n",
"    RUSSE=RUSSE_sampler,\n",
"    RWSD=RWSD_sampler,\n",
"    DaNetQA=DaNetQA_sampler,\n",
"    RuCoS=RuCoS_sampler,\n",
")\n",
"\n",
"def rsg_task(task=None):\n",
"    \"\"\"Sample one (prompt, target) pair from the loaded task data.\n",
"\n",
"    Args:\n",
"        task: key of `samplers` / `task_data`; when None, a task is drawn from\n",
"            `task_ids` with probabilities `task_weights`.\n",
"\n",
"    Returns:\n",
"        tuple (prompt, target), where the prompt is prefixed with '<task> | '.\n",
"    \"\"\"\n",
"    if task is None:\n",
"        (task,) = random.choices(task_ids, weights=task_weights)\n",
"    sampler = samplers[task]\n",
"    record = random.choice(task_data[task])\n",
"    question, answer = sampler(record)\n",
"    return f'{task} | {question}', answer"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fhUqIMy6idsx",
"outputId": "1fa1ed53-4098-4d8f-8fc5-6dab84a93024"
},
"source": [
"# print one sampled prompt and its target for every task, separated by blank lines\n",
"for t in task_ids:\n",
"    q, a = rsg_task(task=t)\n",
"    print(q, a, '', sep='\\n')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"RCB | Дано: Квартира ему не понравилась, он мне долго объяснял, почему — как я понял, вид из окна был не тот, мешал ему работать. Получил квартиру рядом, в другом таком же доме — кстати, при входе в этот столь респектабельный дом на него, стараясь отобрать сумку, напал грабитель и нанес ему несколько ран. Началась эпопея ремонта и приведения квартиры в тот образцовый порядок, который он считал необходимым, единственно возможным.. Гипотеза: Образцовый порядок необходим, единственно возможен.. Логично?\n",
"не очевидно\n",
"\n",
"PARus | Причина: Мальчик боролся со своим старшим братом.. Следствие: Мальчик подражал старшему брату.. Логично?\n",
"нет\n",
"\n",
"MuSeRC | Вопрос: Почему Батлер хочет убить Фрэнка?. Дано: Чикагский пожар 7 октября 1871 года в момент разрушает финансовое благополучие Каупервуда. Огонь, охвативший торговую часть города, вызывает биржевую панику. Фрэнк пустил в оборот пятьсот тысяч из городской казны, и теперь это самая большая его проблема. Стинера нет в городе, и Каупервуд решает начистоту всё рассказать Батлеру, надеясь, что он с Молленхауэром и Симпсоном не допустят обесценивания бумаг. Банкир также упоминает грядущие выборы: исчезновение из казны такой огромной суммы быстро раскроется, и Стинер как представитель республиканцев бросит тень на партию. Батлер сводит его с двумя другими тузами. Однако каждый видит в происходящем свою выгоду и, хотя они сочувствуют Фрэнку, не считают для себя необходимым помогать ему. В это же время Батлер получает анонимное письмо о том, что Эйлин путается с Каупервудом. Реакция дочери на письмо подтверждает написанное. Старого ирландца в момент охватывает ненависть к Фрэнку. Батлер решает утопить негодяя, воспользовавшись предстоящей угрозой банкротства Каупервуда.\n",
"У Каупервуда роман с его дочерью.\n",
"\n",
"TERRa | Дано: \"В отеле 362 номера, включая 72 люкса; 10 залов для мероприятий, в том числе конференц-зал на 300 человек; два ресторана. Ранее руководитель департамента имущества Москвы Наталья Бочарова сообщила, что \"\"Метрополь\"\" после продажи сохранит свое назначение. \"\"Победитель аукциона вместе с договором купли-продажи должен будет подписать охранное обязательство, поскольку гостиница является объектом культурного наследия\"\", - сказала руководитель департамента.\". Гипотеза: \"Отель \"\"Метрополь\"\" утратил статус объекта культурного наследия.\". Логично?\n",
"не очевидно\n",
"\n",
"RUSSE | Слово \"завеса\" употребляется в одинаковом смысле? Текст 1: Завесы из фиолетовых и пурпурных тканей висели от пола до потолка при входе в залу. Текст 2: Этим ледяным словом и вспыхнувшим в ночи видением финки, зажатой в кулак, он как бы приоткрыл завесу своей холодной жестокой души.\n",
"нет\n",
"\n",
"RWSD | Бетт не рассердилась на Салли, которая ее перебила, потому что она остановилась и попросила прощения.. \"она остановилась\" относится к \"Салли\"?\n",
"да\n",
"\n",
"DaNetQA | Вопрос: Вводился ли комендантский час в детройте? Вандализмом оказалась охвачена восточная часть Вудворд-авеню, область вокруг восточной части Гранд-Бульвар, которая идёт на восток/запад, а затем на север/юг до Белл-Айл. С воскресенья 23 июля по четверг 27 июля в беспорядки был вовлечён почти весь город. В Детройте был введён общегородской комендантский час, запрещены продажа алкоголя, огнестрельного оружия, было неофициально сокращено рабочее время из-за гражданских волнений, которые захватили все районы города. Хотя в волнениях участвовали некоторые белые, их движущей силой выступили чёрные американцы, воспринявшие ответные действия властей как форму расизма. Чтобы подавить бунт и пресечь нарушения порядка, губернатор Джордж Ромни вызвал подразделения внутренних войск Национальной гвардии штата Мичиган, в город по приказу президента Линдона Б. Джонсона были введены армейские части: 82-я и 101-я воздушно-десантная дивизии.\n",
"да\n",
"\n",
"RuCoS | В ЭТО, где проанализировали доходы 85 тысяч человек, также установили, что в восточных землях (Саксония-Анхальт, Лейпциг) рабочие зарабатывают более чем в полтора раза меньше, чем те, кто трудится в западной части страны (Баден-Вюртемберге, Гессене, Бремене).. Средняя зарплата рабочих в Германии в 2019 году составляет 43 тысячи евро в год, сообщает Deutsche Welle со ссылкой на исследование портала StepStone. Значительно больше — 58 тысяч евро — получают сотрудники с квалификацией Meister или Techniker, в подчинении которых находится несколько работников. На российском рынке труда к такой квалификации отчасти соответствуют люди, окончившие колледж по специальности «техник». Меньше всех зарабатывают садовники — около 32 тысяч евро. Машинисты — 43,6 тысячи евро. Рабочие и инженеры, которые трудятся в крупных немецких компаниях, как правило, зарабатывают на 20 процентов больше, чем фирмы с численностью сотрудников до 500 человек.. Вопрос: ЭТО - что?\n",
"StepStone\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2OuGcE1OHNln"
},
"source": [
"### Train the model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "kUIfwpQDHSHQ"
},
"source": [],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "BrqtbbrsBYqb"
},
"source": [
"# Checkpoint directory; the tokenizer and the model are both loaded from it.\n",
"# raw_model = '/gd/MyDrive/models/rut5-base-raw'  # start fine-tuning\n",
"raw_model = '/gd/MyDrive/models/rut5-base-partial'  # continue fine-tuning\n",
"tokenizer = T5Tokenizer.from_pretrained(raw_model)\n",
"model = T5ForConditionalGeneration.from_pretrained(raw_model)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "moA8I4I0KHHZ"
},
"source": [
"# use the GPU when one is available, otherwise fall back to the CPU instead of failing\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"model.to(device);"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "LUnU1J75Cr3x"
},
"source": [
"# hand Adam only the parameters that still require gradients\n",
"optimizer = torch.optim.Adam(\n",
"    params=list(filter(lambda p: p.requires_grad, model.parameters())),\n",
"    lr=1e-5,\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "LNuz-_TBioM5"
},
"source": [
"```\n",
"translate_task 1.56\n",
"paraphrase_task 1.55\n",
"fill_gap_task 3.21\n",
"assemble_task 1.51\n",
"simplify_task 1.10\n",
"reply_task 3.27\n",
"answer_task 3.91\n",
"ask_task 1.38\n",
"comprehend_task 0.38\n",
"headline_task 1.71\n",
"quiz_task 4.94\n",
"```"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hKg8G2tSH4Bq",
"outputId": "b4834946-fedb-4a55-a1b6-81ff9f9efe62"
},
"source": [
"TASKS = [\n",
" quiz_task,\n",
" answer_task,\n",
" reply_task,\n",
" fill_gap_task,\n",
" assemble_task,\n",
" translate_task,\n",
" headline_task,\n",
" paraphrase_task,\n",
" ask_task,\n",
" rsg_task,\n",
" simplify_task,\n",
" comprehend_task,\n",
"]\n",
"# omit the summarization task because its texts are too long\n",
"len(TASKS)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"12"
]
},
"metadata": {
"tags": []
},
"execution_count": 83
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uowD1nQ0V613"
},
"source": [
"def predict(x, n=3):\n",
"    \"\"\"Sample `n` candidate outputs from the global `model` for the input text `x`.\n",
"\n",
"    Args:\n",
"        x: input text, encoded with the global `tokenizer`.\n",
"        n: number of sequences to sample (default 3).\n",
"\n",
"    Returns:\n",
"        list of decoded strings (special tokens skipped), one per sampled sequence.\n",
"    \"\"\"\n",
"    inputs = tokenizer(x, return_tensors='pt')\n",
"    # move every input tensor to the device the model lives on\n",
"    inputs = {k: v.to(model.device) for k, v in inputs.items()}\n",
"    with torch.no_grad():\n",
"        hypotheses = model.generate(\n",
"            **inputs,\n",
"            do_sample=True,\n",
"            top_p=0.9,\n",
"            num_return_sequences=n,  # honour the argument; it used to be hard-coded to 3\n",
"            repetition_penalty=2.5,\n",
"            max_length=64,\n",
"        )\n",
"    return [tokenizer.decode(h, skip_special_tokens=True) for h in hypotheses]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pIhWwOpdVwlT",
"outputId": "b6aec26b-626f-4b00-b1f6-9351bc863da8"
},
"source": [
"# switch the model to evaluation mode before sampling\n",
"model.eval()\n",
"\n",
"# for every task: print one sampled input with its reference, then three generations\n",
"for t in TASKS:\n",
"    x, y = t()\n",
"    print(x, ' \\n --> ', y)\n",
"    print(*predict(x, n=3), '', sep='\\n')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"quiz | Спортивная встреча для установления первенства по виду спорта \n",
" --> состязание\n",
"воркаут\n",
"на льду\n",
"атлетика\n",
"\n",
"answer | Какие празднечные конкурсы или игры вы знаете? \n",
" --> вопрос-ответ, на одной бумажке пишется вопрос, на другой - ответ, бумажек лучше много, вопросы кладутся в одну шапку, ответы - в другую. Гости по-очереди вытаскивают сначала вопрос, говорит, кому отвечать и другой вытаскивает ответ, получается смешно! К примеру, вопрос: \"Вы свою жену любите? \" а ответ на это попался: \"Только в большой компании! \" - это у нас так было. Вот тебе примерные вопросы и ответы: Игра «Вопрос, ответ» . <br> <br>Вариант 1 <br>Вопросы: <br> <br>1.Хотели бы вы стать миллионером? <br>2.Вам хотелось бы попасть в гарем? <br>3.Вы часто бываете в ресторане? <br>4.Вы с удовольствием ходите на работу? <br>5.Могли бы вы родить богатыря? <br>6.Носите ли вы парик? <br>7.У вас есть недостатки? <br>8.Вы любите музыку? <br>9.Вы всегда так вежливы как сегодня? <br>10.Способны ли вы любить? <br>11.Вы с удовольствием выпиваете? <br>12.Вы любите танцевать? <br>13.Вы хотите сесть рядом с кем-нибудь (называется имя) ? <br>14.Вы любите детей? <br>15.Вы можете быть верным (верной) ? <br>16.Вы часто ходите на свидание? <br>17.В пьяном виде вы ведете себя прилично? <br>18.Вы меня любите? <br>19.Вы ревнивы? <br>20.Вы всегда так много едите? <br>21.Верите ли вы в чудеса? <br>22.Вы не жалеете что женаты (замужем) ? <br>23.Хотели бы вы сегодня напиться? <br>24.Вы скромны? <br>25.Хотели бы вы иметь новых друзей? <br>Ответы: (вариант1) <br>1. Нет, однажды я уже попробовал, но ничего не получилось <br>2. Это мое хобби <br>3. Только на грани отчаяния <br>4. Мы ведь не ангелы, всякое бывает <br>5. Кто же из нас не без греха <br>6. Еще как <br>7. Это для меня истинное удовольствие <br>8. Только в минуты слабости <br>9. Я не способен (не способна) на такую глупость <br>10. Это моя мечта <br>11. Нет, я слишком хорошо воспитан (воспитана) <br>12. Я вам отвечу, когда останемся вдвоем <br>13. Уж лучше я промолчу <br>14. Только в бане <br>15. Каждый второй день <br>16. Ни в коем случае <br>17. Только в трезвом виде <br>18. Если на улице холодно <br>19. 
Систематически <br>20. В дни получки <br>21. В субботу это просто необходимо <br>22. От нечего делать <br>23. Только в доме отдыха или на курорте <br>24. Об этом в слух не говорят <br>25. В обеденный перерыв <br>\n",
"Сегодня праздничные конкурсы.\n",
"Лучше какие-нибудь фестивали!\n",
"А у нас конечно все замечательно....\n",
"\n",
"reply | Я пошел.\n",
"\n",
"Куда? \n",
" --> Посмотреть на место действия.\n",
"Все.\n",
"Как уехала?\n",
"Не в никуда.\n",
"\n",
"fill | Странный ___ эксперимент по созданию притяжения и выстраиванию орбит между небесными телами, которые, при удачном стечении обстоятельств, имеют шанс стать новой солнечной системой или даже галактикой. \n",
" --> космический\n",
"и необычный\n",
"и интересный\n",
"из них был\n",
"\n",
"assemble | видимо желай этого оставаться забвение добровольный \n",
" --> Но, видимо, не желаешь этого, раз остаешься в добровольном забвении.\n",
"Видимо этого желают оставаться добровольным.\n",
"Что видимо желают сохраняться этим, вспоминает она из-за забвения.\n",
"Видимо, желая этого оставаться добровольным.\n",
"\n",
"translate en-ru | 2.41% of the population were Hispanic or Latino of any race. \n",
" --> 2,41 % населения — латиноамериканцы всех рас.\n",
"2.41% населения составляли Hispanic или Latinо of any race.\n",
"2.41% населения составляли Hispanic или Latino.\n",
"2.41% населения были Испанцы или латиноамериканцами.\n",
"\n",
"headline | Медицинский сервис DocDoc, входящий в экосистему Сбербанка, запустил бесплатный сервис поддержки людей старшего поколения. С его помощью можно организовать доставку продуктов и лекарств, связаться с врачами и научиться пользоваться полезными цифровыми сервисами. Также есть возможность получить психологическую помощь или просто поговорить. Сервис работает в двух форматах: На базе сервиса доступны 5 направлений, которыми можно воспользоваться по телефону. Все сервисы предоставляются бесплатно благодаря финансовой поддержке Сбербанка. Эти направления были выбраны на основе данных, полученных в ходе работы горячей линии DocDoc по коронавирусу. За два месяца на нее обратилось более 200 тысяч человек. Дмитрий Петрухин, генеральный директор сервиса DocDoc: Компания DocDoc начала работу в 2012 году как первый в России сервис по выбору врачей на основе проверенных отзывов самих пациентов. В 2017 году DocDoc на 80% вошел в Группу Сбербанк, после чего сервис начал трансформацию в медицинскую платформу — площадку, на которой представлены самые востребованные медицинские услуги: запись к врачу и телемедицина. Сервис телемедицины DocDoc (онлайн-консультации с врачами) был запущен в 2018 году. На сервисе есть возможность в любое время связаться онлайн с дежурным терапевтом/педиатром либо, по предварительной записи, с врачами более 40 направлений. На данном этапе в проекте DocDoc участвует более 4 000 частных клиник. Общее число клиентов DocDoc сегодня превышает 7,5 млн человек. ПАО Сбербанк. На правах рекламы \n",
" --> Сбербанк и DocDoc запускают сервис поддержки старшего поколения в период пандемии\n",
"Сбербанк вошел к экосистеме DocDoc\n",
"DocDoc запустил бесплатный сервис поддержки людей старшего поколения\n",
"DocDoc запустил сервис поддержки людей старшего поколения\n",
"\n",
"paraphrase | Тому не стоит ехать туда одному. \n",
" --> Тому не стоит ходить туда одному.\n",
"Тому не нужно ходить один.\n",
"Тому не стоит ехать одновременно.\n",
"Тому не стоит ехать один домой.\n",
"\n",
"ask | Благовещенск лежит на одной параллели с Киевом и российским Черноземьем, несмотря на это зимы здесь более продолжительные и значительно более холодные. Погода в Благовещенске, ввиду очень небольшой теплоёмкости воздуха, в температурном режиме очень зависит от продолжительности солнечного сияния и поступающего солнечного тепла. Поэтому декабрь холоднее февраля, а июнь лишь чуть холоднее, чем август. В Благовещенске континентальный вариант умеренного муссонного климата. Континентальность климата проявляется в большой годовой (43°С) и суточной (10-15°С) амплитуде температуры. Муссонность климата выражается в направлении сезонных ветров, активной циклонической деятельности и большом количестве осадков в теплое время года. Лето жаркое со значительным количеством солнечного сияния. Зима холодная, сухая, с маломощным снежным покровом. Температурный рекорд был зафиксирован 25 июня 2010 года, когда температура воздуха в городе поднялась до отметки +39,4 °C[25][26]. \n",
" --> Какая дата является температурным максимумом в Благовещенске?\n",
"Чем проявляется континентальность климата в Благовещенске?\n",
"На каком параллели лежит Благовещенск?\n",
"В каком климате лежит Благовещенск?\n",
"\n",
"RuCoS | Бывший советник избирательного штаба нынешнего президента США Дональда Трампа Джордж Пападопулос солгал в разговоре с ФБР. Об этом сообщается в судебных документах, обнародованных властями США. Ложь Пападопулоса касалась его встречи с неназванным профессором, имеющим связи с российским правительством. 14 марта 2016 года в Лондоне Пападопулос, уже будучи советником Трампа, поговорил с ним. Мужчина рассказал американцу, что у российских властей есть компромат на кандидата от демократов Хиллари Клинтон. Соратник Трампа также с помощью этого профессора пытался организовать встречу работников штаба политика с российскими официальными лицами.\n",
"Вкратце: \n",
" --> Кроме того, агенты выяснили, что Пападопулос лгал насчет встречи с женщиной, которую в разговорах с соратниками он описывал как «племянницу Путина» (речь идет о президенте России Владимире Путине — прим.\n",
"Пападопулос также заявлял, что он сотрудничал с ФБР в марте 2016 года.\n",
"В январе 2016 года в Нью-Йорке отметил, что после обсуждения конфликта со стороны ФБР невозможно сообщить информацию на данный момент.\n",
"В марте 2016 года в Лондона утром 7 мая проводился выборный процесс с представителями ФБР, находящихся под руководством Трампа.\n",
"\n",
"simplify | Холидей записывалась для Колумбии в конце 1930-х годов, когда ей представили «Странный плод», песню, основанную на стихотворении о линчевании, написанном Абелем Мерополем, еврейским школьным учителем из Бронкса. \n",
" --> Она слышала о песне под названием «Странные фрукты». Он был основан на стихотворении о линчевании, написанном Абелем Мерополем, еврейским школьным учителем из Бронкса.\n",
"Холидей записывалась для Колумбии в конце 1930-х, на стихотворениях Абелем Мерополом.\n",
"Холидей записывалась для Колумбии в конце 1930-х лет. Ее представили «Странный плод», песню, основанную на стихотворениях Абелем Мерополей из Бронкса на нем написанное по тексту Эдель Франтесса Измания\n",
"Холидей записывалась для Колумбии в конце 1930-х, приняв концерт «Странный плод».\n",
"\n",
"comprehend | У многих мелких рачков с тонким карапаксом нет жабр, а дыхание идёт через всю поверхность тела. У сухопутных ракообразных имеются специальные приспособления для дыхания атмосферным кислородом, например, псевдотрахеи (глубокие впячивания) на брюшных ножках мокриц. Полость конечности заполнена гемолимфой, омывающей впячивания и осуществляющей газообмен. Сухопутные крабы дышат кислородом, растворённым в воде, покрывающей тонкой плёнкой мембраны жаберной полости и защищённой от испарения карапаксом. Однако для дыхания сухопутным ракообразным всё равно необходима повышенная влажность воздуха..\n",
"Вопрос: Где располагаются псевдотрахеи у мокриц? \n",
" --> На брюшных ножках.\n",
"на брюшных ножках\n",
"на брюшных ножках\n",
"на брюшных ножках\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ufWOVuaHLGQ3",
"outputId": "6790dc1f-22ec-4640-ff86-155230163add"
},
"source": [
"for t in task_ids:\n",
" q, a = rsg_task(task=t)\n",
" print(q)\n",
" print(a)\n",
" for p in predict(q, n=3):\n",
" print(p)\n",
" print()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"RCB | Дано: И тут на меня нашло. Как помню, в первый раз жизни защемило сердце. Позже стал осознавать, что так проявляет себя моя тревожность.. Гипотеза: Так проявляет себя моя тревожность.. Логично?\n",
"следствие\n",
"следствие\n",
"не очевидно\n",
"следствие\n",
"\n",
"PARus | Следствие: Пожарная сигнализация сработала.. Причина: Я зажёг свечу.. Логично?\n",
"нет\n",
"нет\n",
"нет\n",
"да\n",
"\n",
"MuSeRC | Вопрос: Почему Эмпедокл терзается и чувствует себя бессильным и опустошённым?. Дано: Мужчины со злорадством рассуждают: сдал Эмпедокл, и поделом ему. Слишком много возомнил о себе, открыл черни божественные тайны, которым надлежало оставаться достоянием одних жрецов. Вредным было его влияние на народ — все эти дерзкие речи о новой жизни, которая должна заменить старый, привычный быт, призывы не покоряться исконным обычаям и традиционным верованиям. Человек не должен нарушать положенные ему пределы, бунтарство обернулось для Эмпедокла поражением. Поскольку он удалился от всех, прошла молва, что боги взяли его живым на небо. Народ привык считать Эмпедокла пророком, чародеем, полубогом, необходимо низвергнуть его с пьедестала, изгнать из города. Пусть сограждане увидят его сломленным духом, утратившим былое красноречие и необыкновенные способности, тогда ничего не будет стоить восстановить их против Эмпедокла. Эмпедокл терзается — похоже, гордыня сгубила его, бессмертные не простили ему попытки стать с ними наравне, отвернулись от него. Он чувствует себя бессильным и опустошённым — он подчинил себе природу, овладев ее тайнами, но после этого видимый мир лишился в его глазах красоты и обаяния, все в нем кажется теперь мелочным и недостойным внимания. К тому же он так и остаётся непонятым соотечественниками, хоть те ему и поклоняются. Ему так и не удалось поднять их на высоту своей мысли.\n",
"Он чувствует себя бессильным и опустошённым — он подчинил себе природу, овладев её тайнами, но после этого видимый мир лишился в его глазах красоты и обаяния, всё в нем кажется теперь мелочным и недостойным внимания.\n",
"Это сгубила себя, бессмертные не простили Эмпедокло попытки стать со своими наравне.\n",
"Чего угнетал Эмпедокл?\n",
"Чтобы не простить бессмертных попытки стать с им наравне, отвернулись от него.\n",
"\n",
"TERRa | Дано: Но согласно постановления Верховного Совета СССР еще от ноября 1990 года, женщины на селе должны работать лишь 36 часов в неделю. Добровольно ей платить компенсацию за переработку отказались. Суд встал на сторону обиженной сотрудницы, но частично, так как были пропущены сроки обращения.. Гипотеза: Женщина на селе переработала.. Логично?\n",
"логично\n",
"не очевидно\n",
"не очевидно\n",
"не очевидно\n",
"\n",
"RUSSE | Слово \"достижение\" употребляется в одинаковом смысле? Текст 1: Вдаваться в детали ему не хотелось, но он понимал, что для достижения результата она использует самые разные препараты. Текст 2: Если есть прогресс, благо человечества, значит, та наука, которая направлена на достижение некоторого условного блага […] нравственна, а которая это благо не имеет в виду – пусть провалится.\n",
"да\n",
"да\n",
"да\n",
"нет\n",
"\n",
"RWSD | Я уверен, что на моем плане будет отмечен этот дом. Он очень подробный.. \"Он\" относится к \"дом\"?\n",
"нет\n",
"да\n",
"нет\n",
"нет\n",
"\n",
"DaNetQA | Вопрос: Болеют ли животные раком? Го́лый землеко́п — небольшой роющий грызун семейства землекоповых. Вид отличается уникальными для млекопитающих особенностями: сложной социальной организацией колонии, холоднокровностью, нечувствительностью к некоторым формам боли , выносливостью к высоким концентрациям CO2. Изначально считалось, что особи обладают иммунитетом к раку в целом, но в феврале 2016 года американские учёные сообщили о двух зарегистрированных у голых землекопов случаях рака . Живёт на порядок дольше других грызунов подобного размера . Активность репарации повреждённых оснований и нуклеотидов в клетках голого землекопа гораздо выше, чем в клетках мыши и может быть ответственна за то, что продолжительность жизни этого грызуна достигает 30 лет .\n",
"да\n",
"нет\n",
"нет\n",
"да\n",
"\n",
"RuCoS | Инициатор экологического движения \"Пятницы ради будущего\" решила полностью посвятить себя борьбе за защиту климата. 16-летняя шведская активистка после девятого класса пропустит в школе целый год. @header В борьбе против глобального потепления нужны срочные меры\n",
"Поскольку Грета Тунберг по экологическим причинам не летает на самолетах, ее семья изыскивает альтернативы для ее поездки через Атлантику. Одним из вариантов является путешествие на морском лайнере. Мир удивлен массовыми протестами школьников. Между тем экологическое движение Fridays for Future планирует очередную акцию с рекордным числом участников.\n",
"Вкратце:\n",
"Грета Тунберг - инициатор экологического движения, называемого в Германии Fridays for Future (\"Пятницы ради будущего\").\n",
"Гитла Тунберг намерена продолжить в школе целый год, отправляясь на побережье.\n",
"Большинство студентов, которых увлекает экологическая деятельность после девятого класса выросли в школе за счет поездки на морской самолет.\n",
"Что заботится Green Sun, который также вынужден быть вместо этого?\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Mn14elfSKOxy"
},
"source": [
"import gc\n",
"\n",
"def cleanup():\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
"\n",
"cleanup()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ORd8PLepNpCm"
},
"source": [
"optimizer.param_groups[0]['lr'] = 1e-5"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "he9d-PVH55Q8"
},
"source": [
"The easiest tasks seem to be `simplify` and `comprehend` (because the correct response is obvious from the text and is short), the most difficult are `translate` (probably because languages are not aligned), `fill_gap` (surprisingly, because T5 was trained to do it), `reply` and `answer` (okay, here the answer is unpredictable)."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3bzXl50I3_ND",
"outputId": "9f28c9ac-af98-42ac-b32e-ddfb9750d8c4"
},
"source": [
"def eval_losses(n=10, max_len=1024):\n",
" for task in TASKS:\n",
" tot = 0\n",
" for i in range(n):\n",
" xxx, yyy = task()\n",
" x = tokenizer(xxx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)\n",
" y = tokenizer(yyy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)\n",
"\n",
" loss = model(\n",
" input_ids=x.input_ids,\n",
" attention_mask=x.attention_mask,\n",
" labels=y.input_ids,\n",
" decoder_attention_mask=y.attention_mask,\n",
" return_dict=True\n",
" ).loss\n",
" loss.backward()\n",
" tot += loss.item()\n",
" print(f'{task.__name__:20s} {tot / n :2.2f}')\n",
"\n",
"eval_losses(n=20)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"quiz_task 3.54\n",
"answer_task 3.54\n",
"reply_task 2.99\n",
"fill_gap_task 2.53\n",
"assemble_task 1.57\n",
"translate_task 2.15\n",
"headline_task 1.55\n",
"paraphrase_task 0.80\n",
"ask_task 0.80\n",
"rsg_task 0.93\n",
"simplify_task 0.88\n",
"comprehend_task 0.26\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qhCPxgnlViwF"
},
"source": [
"One round of fine-tuning lasts until Colab stops\n",
"* May 11: loss on 10 tasks goes from 1.89 to 1.69 roughly\n",
"* May 12: loss on 10 tasks stays about 1.69 for 15K batches\n",
"* May 13: loss still flat about 1.69-1.67. Change acc.steps from 8 to 32, then after time loss goes to 1.62-1.64\n",
"* May 27: use a larger GPU (colab pro), batch 4 instead of 2. Loss is 1.27-1.24 because of more `<pad>` tokens -> after a night, 1.21-1.22\n",
"* Add the quiz task, loss goes back up to 1.22-12.4, but decreases to 1.19 after some training\n",
"* reduce batch to 2, loss goes back to 1.63 (but now with a more difficult quiz task)\n",
"* May 28: add russian SuperGlue, loss goes up to 1.73, then down to 1.50 - 1.55"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"0a0dec11e0354f0db5871ed73cacb3f1",
"0ee6cc753f584c1785ff982167ad9c04",
"796c08d68cc6443dbc5b4c9100cc7f0a",
"c2fb21304bea46ddb9e582c92231231d",
"78877bf42425411485ba4a334ca8f202",
"4e392d8d865f4a61b85c64c7f14bb7fd",
"2e8e2326727a4e48b24b4c47c9fb1f61",
"6a1d007fdb4447a1a8511e4eb450285b"
]
},
"id": "Eted5RrGEqv_",
"outputId": "a1fb788f-ddbf-46c8-d2f5-80f092793fcd"
},
"source": [
"model.train();\n",
"batch_size = 2\n",
"max_len = 1024\n",
"epochs = 5\n",
"accumulation_steps = 32\n",
"save_steps = 5000\n",
"\n",
"window = 5000\n",
"ewm = 0\n",
"\n",
"tq = trange(int(100000000 / batch_size))\n",
"cleanup()\n",
"\n",
"for i in tq:\n",
" xx = []\n",
" yy = []\n",
" for _ in range(batch_size):\n",
" xxx, yyy = random.choice(TASKS + [rsg_task] * 3)() # rsg is more various, increase its occurrence 4-fold\n",
" xx.append(xxx)\n",
" yy.append(yyy)\n",
"\n",
" try:\n",
" x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)\n",
" y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)\n",
" # do not force the model to predict pad tokens\n",
" y.input_ids[y.input_ids==0] = -100\n",
"\n",
" loss = model(\n",
" input_ids=x.input_ids,\n",
" attention_mask=x.attention_mask,\n",
" labels=y.input_ids,\n",
" decoder_attention_mask=y.attention_mask,\n",
" return_dict=True\n",
" ).loss\n",
" loss.backward()\n",
" # print('ok')\n",
" except RuntimeError:\n",
" print([xxx.split(' |')[0] for xxx in xx])\n",
" loss = None\n",
" cleanup()\n",
" continue\n",
"\n",
" w = 1 / min(i+1, window)\n",
" ewm = ewm * (1-w) + loss.item() * w\n",
" tq.set_description(f'loss: {ewm}')\n",
"\n",
" if i % accumulation_steps == 0:\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
" cleanup()\n",
"\n",
" if i % window == 0 and i > 0:\n",
" print(ewm)\n",
" cleanup()\n",
" # optimizer.param_groups[0]['lr'] *= 0.999\n",
" if i % save_steps == 0 and i > 0:\n",
" model.save_pretrained(MODEL_NAME)\n",
" tokenizer.save_pretrained(MODEL_NAME)\n",
" print('saving...', i, optimizer.param_groups[0]['lr'])\n",
"\n",
" try:\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
" eval_losses()\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
" except RuntimeError:\n",
" cleanup()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0a0dec11e0354f0db5871ed73cacb3f1",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=50000000.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"['RUSSE', 'translate en-ru']\n",
"['translate en-ru', 'RuCoS']\n",
"1.5169176257705035\n",
"saving... 5000 1e-05\n",
"quiz_task 3.92\n",
"answer_task 3.97\n",
"reply_task 3.50\n",
"fill_gap_task 3.29\n",
"assemble_task 2.47\n",
"translate_task 1.94\n",
"headline_task 2.13\n",
"paraphrase_task 1.58\n",
"ask_task 1.84\n",
"rsg_task 1.37\n",
"simplify_task 0.64\n",
"comprehend_task 0.39\n",
"1.5029236435164788\n",
"saving... 10000 1e-05\n",
"quiz_task 4.45\n",
"answer_task 3.96\n",
"reply_task 3.64\n",
"fill_gap_task 3.94\n",
"assemble_task 2.74\n",
"translate_task 2.51\n",
"headline_task 2.03\n",
"paraphrase_task 1.50\n",
"ask_task 1.35\n",
"rsg_task 1.62\n",
"simplify_task 0.89\n",
"comprehend_task 0.76\n",
"['paraphrase', 'translate ru-en']\n",
"1.5008431054809206\n",
"saving... 15000 1e-05\n",
"quiz_task 4.35\n",
"answer_task 4.38\n",
"reply_task 3.12\n",
"fill_gap_task 3.98\n",
"assemble_task 2.12\n",
"translate_task 1.32\n",
"headline_task 2.08\n",
"paraphrase_task 1.02\n",
"ask_task 1.29\n",
"rsg_task 1.29\n",
"simplify_task 1.03\n",
"comprehend_task 0.97\n",
"['translate ru-en', 'headline']\n",
"['simplify', 'translate ru-en']\n",
"1.4978644233004048\n",
"saving... 20000 1e-05\n",
"quiz_task 4.57\n",
"answer_task 4.34\n",
"reply_task 3.19\n",
"fill_gap_task 3.07\n",
"assemble_task 1.56\n",
"translate_task 2.00\n",
"headline_task 1.96\n",
"paraphrase_task 1.59\n",
"ask_task 1.30\n",
"rsg_task 1.48\n",
"simplify_task 1.17\n",
"comprehend_task 0.50\n",
"['headline', 'translate en-ru']\n",
"1.4910498812110424\n",
"saving... 25000 1e-05\n",
"quiz_task 4.22\n",
"answer_task 3.88\n",
"reply_task 3.63\n",
"fill_gap_task 3.64\n",
"assemble_task 1.97\n",
"translate_task 1.69\n",
"headline_task 2.20\n",
"paraphrase_task 1.19\n",
"ask_task 1.50\n",
"rsg_task 1.71\n",
"simplify_task 0.85\n",
"comprehend_task 1.18\n",
"['translate en-ru', 'RUSSE']\n",
"1.503235560207827\n",
"saving... 30000 1e-05\n",
"quiz_task 4.89\n",
"answer_task 4.35\n",
"reply_task 2.45\n",
"fill_gap_task 2.78\n",
"assemble_task 2.29\n",
"translate_task 1.95\n",
"headline_task 2.13\n",
"paraphrase_task 1.27\n",
"ask_task 1.71\n",
"rsg_task 1.05\n",
"simplify_task 1.00\n",
"comprehend_task 0.55\n",
"['paraphrase', 'translate ru-en']\n",
"1.4912955104861614\n",
"saving... 35000 1e-05\n",
"quiz_task 4.44\n",
"answer_task 4.37\n",
"reply_task 3.35\n",
"fill_gap_task 3.04\n",
"assemble_task 1.94\n",
"translate_task 2.14\n",
"headline_task 1.82\n",
"paraphrase_task 1.13\n",
"ask_task 1.40\n",
"rsg_task 0.64\n",
"simplify_task 1.21\n",
"comprehend_task 0.52\n",
"['translate ru-en', 'assemble']\n",
"['translate en-ru', 'comprehend']\n",
"1.5052568758166156\n",
"saving... 40000 1e-05\n",
"quiz_task 3.97\n",
"answer_task 4.52\n",
"reply_task 3.34\n",
"fill_gap_task 3.75\n",
"assemble_task 1.73\n",
"translate_task 2.32\n",
"headline_task 2.04\n",
"paraphrase_task 1.09\n",
"ask_task 1.39\n",
"rsg_task 1.98\n",
"simplify_task 1.20\n",
"comprehend_task 0.38\n",
"1.4888850910856006\n",
"saving... 45000 1e-05\n",
"quiz_task 4.13\n",
"answer_task 4.10\n",
"reply_task 3.22\n",
"fill_gap_task 3.01\n",
"assemble_task 2.53\n",
"translate_task 2.95\n",
"headline_task 1.48\n",
"paraphrase_task 1.48\n",
"ask_task 1.23\n",
"rsg_task 1.44\n",
"simplify_task 1.20\n",
"comprehend_task 0.41\n",
"1.4977066553639546\n",
"saving... 50000 1e-05\n",
"quiz_task 3.66\n",
"answer_task 3.89\n",
"reply_task 3.20\n",
"fill_gap_task 3.16\n",
"assemble_task 2.11\n",
"translate_task 2.37\n",
"headline_task 1.97\n",
"paraphrase_task 1.54\n",
"ask_task 1.32\n",
"rsg_task 1.25\n",
"simplify_task 1.09\n",
"comprehend_task 0.32\n",
"1.4945176678043737\n",
"saving... 55000 1e-05\n",
"quiz_task 3.06\n",
"answer_task 3.73\n",
"reply_task 3.09\n",
"fill_gap_task 3.54\n",
"assemble_task 2.38\n",
"translate_task 2.19\n",
"headline_task 1.76\n",
"paraphrase_task 1.13\n",
"ask_task 1.41\n",
"rsg_task 2.23\n",
"simplify_task 1.22\n",
"comprehend_task 0.63\n",
"['translate en-ru', 'RuCoS']\n",
"['translate en-ru', 'RUSSE']\n",
"1.4950395862961616\n",
"saving... 60000 1e-05\n",
"quiz_task 5.22\n",
"answer_task 3.53\n",
"reply_task 2.55\n",
"fill_gap_task 3.27\n",
"assemble_task 2.24\n",
"translate_task 2.36\n",
"headline_task 2.08\n",
"paraphrase_task 1.65\n",
"ask_task 1.35\n",
"rsg_task 0.68\n",
"simplify_task 1.19\n",
"comprehend_task 0.57\n",
"['translate en-ru', 'simplify']\n",
"1.4971573782699532\n",
"saving... 65000 1e-05\n",
"quiz_task 4.11\n",
"answer_task 3.65\n",
"reply_task 3.62\n",
"fill_gap_task 2.62\n",
"assemble_task 2.11\n",
"translate_task 2.00\n",
"headline_task 2.57\n",
"paraphrase_task 1.31\n",
"ask_task 1.68\n",
"rsg_task 1.63\n",
"simplify_task 1.09\n",
"comprehend_task 0.58\n",
"['paraphrase', 'translate ru-en']\n",
"['comprehend', 'translate en-ru']\n",
"1.4931211285863748\n",
"saving... 70000 1e-05\n",
"quiz_task 4.90\n",
"answer_task 3.77\n",
"reply_task 3.13\n",
"fill_gap_task 3.17\n",
"assemble_task 2.11\n",
"translate_task 2.31\n",
"headline_task 1.48\n",
"paraphrase_task 1.85\n",
"ask_task 1.02\n",
"rsg_task 1.60\n",
"simplify_task 1.19\n",
"comprehend_task 0.59\n",
"1.4789641368957018\n",
"saving... 75000 1e-05\n"
],
"name": "stdout"
},
{
"output_type": "error",
"ename": "ValueError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-97-f7f6510afa64>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0meval_losses\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-90-6939c2814546>\u001b[0m in \u001b[0;36meval_losses\u001b[0;34m(n, max_len)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mxxx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myyy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxxx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_tensors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'pt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtruncation\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_length\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0myyy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_tensors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'pt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtruncation\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_length\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m loss = model(\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0m_is_valid_text_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2262\u001b[0m raise ValueError(\n\u001b[0;32m-> 2263\u001b[0;31m \u001b[0;34m\"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2264\u001b[0m \u001b[0;34m\"or `List[List[str]]` (batch of pretokenized examples).\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2265\u001b[0m )\n",
"\u001b[0;31mValueError\u001b[0m: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GBl75ZomJ07u"
},
"source": [
"model.save_pretrained(MODEL_NAME)\n",
"tokenizer.save_pretrained(MODEL_NAME)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "T9Ou6TOrMCr3"
},
"source": [
"!ls $MODEL_NAME"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_nQ6HgBL3Q0h"
},
"source": [],
"execution_count": null,
"outputs": []
}
]
}
@alex2romanov
Copy link

Привет! Пишет ошибка. Хочу посмотреть на твой код.

@avidale
Copy link
Author

avidale commented Nov 30, 2023

Привет! Ошибка - у гитхабовского вебвьюера. Но если ты скачаешь этот файл и откроешь где-нибудь ещё, всё будет отлично видно.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment