Skip to content

Instantly share code, notes, and snippets.

@avidale
Created April 30, 2021 21:51
Show Gist options
  • Star 17 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save avidale/44cd35bfcdaf8bedf51d97c468cc8001 to your computer and use it in GitHub Desktop.
Save avidale/44cd35bfcdaf8bedf51d97c468cc8001 to your computer and use it in GitHub Desktop.
create_rut5-base.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "create_rut5-base.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyO5k7Vc4zthTK1pkTfX5eNT",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"2d65b38f1ede49a0b4ae70b7e1f03359": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_aace2f95ba334bd48136c0caf5dca14c",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_745f0b9b29a0423d8759fb9f2c52cfad",
"IPY_MODEL_0c1bd6c7a83a479db23009b96ef9caf4"
]
}
},
"aace2f95ba334bd48136c0caf5dca14c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"745f0b9b29a0423d8759fb9f2c52cfad": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_ed1988f523184224bb1c8b6096ab31f1",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 4309802,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 4309802,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7a16f7fd4b6d4b0e8eabfe1f126d79d1"
}
},
"0c1bd6c7a83a479db23009b96ef9caf4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_32c54722ab15498da4adc548eccf603f",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 4.31M/4.31M [01:19<00:00, 54.1kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3a11705472864ea9a248b92c689f67e1"
}
},
"ed1988f523184224bb1c8b6096ab31f1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"7a16f7fd4b6d4b0e8eabfe1f126d79d1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"32c54722ab15498da4adc548eccf603f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3a11705472864ea9a248b92c689f67e1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5a1a3f0010324df4b4dd42289eb258f4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_983350c43d3e428b8573c8a7f6ed43ec",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_2467e4daed9945e9bad79d0bcec4efd2",
"IPY_MODEL_1f8a7f5fe43b41db8fabc2ffad127423"
]
}
},
"983350c43d3e428b8573c8a7f6ed43ec": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2467e4daed9945e9bad79d0bcec4efd2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_a6590375d193488abdbb9fe6bb46f026",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 65,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 65,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_1a61b2d84b654b18bfafaf139401dabd"
}
},
"1f8a7f5fe43b41db8fabc2ffad127423": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_fa9c3c9b135a4440b5c0fcbce359b151",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 65.0/65.0 [00:00<00:00, 127B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_ad57c3a7bb5542398cc481744bf6fe58"
}
},
"a6590375d193488abdbb9fe6bb46f026": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"1a61b2d84b654b18bfafaf139401dabd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"fa9c3c9b135a4440b5c0fcbce359b151": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"ad57c3a7bb5542398cc481744bf6fe58": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5a3980a7d3434549aeb0276ff25dfc37": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_12935e9215b841dda2e40a1c3e497726",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_e342244246e24e858d5036b4eef040b8",
"IPY_MODEL_50651c9c71f849569d70d7fe037a2c2f"
]
}
},
"12935e9215b841dda2e40a1c3e497726": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e342244246e24e858d5036b4eef040b8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_81201911fdff4840b391ab5cbc6c2874",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 376,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 376,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_73e3647a17ef45bb8080ab67c2e70120"
}
},
"50651c9c71f849569d70d7fe037a2c2f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_531ee8dbd2354128b842d82db5ec85ac",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 376/376 [01:17<00:00, 4.82B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_ae0027cb61cb4f41b3ee8ab127b3a7b7"
}
},
"81201911fdff4840b391ab5cbc6c2874": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"73e3647a17ef45bb8080ab67c2e70120": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"531ee8dbd2354128b842d82db5ec85ac": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"ae0027cb61cb4f41b3ee8ab127b3a7b7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7bf6adb3c017459f85a2399ede31edb2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_1fb43db4b3d74b25b2e0e7f3daa5f4c1",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_56ffd44ce9dd4cfb902b3e5d785985e5",
"IPY_MODEL_3146e7ba8c0d44aa8c07f31006b8dde5"
]
}
},
"1fb43db4b3d74b25b2e0e7f3daa5f4c1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"56ffd44ce9dd4cfb902b3e5d785985e5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_737f1d75176f403d95d7e0ea18933d51",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1000000,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1000000,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_faebe32278c04527abd995a4da588c13"
}
},
"3146e7ba8c0d44aa8c07f31006b8dde5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_a630c1d2b88d4f9dbda38020f7eaa287",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1000000/1000000 [08:19<00:00, 2003.52it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9181d9829eb24887b18f5e16364b6584"
}
},
"737f1d75176f403d95d7e0ea18933d51": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"faebe32278c04527abd995a4da588c13": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a630c1d2b88d4f9dbda38020f7eaa287": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"9181d9829eb24887b18f5e16364b6584": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"016ce230728a4da28a8992a571807576": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_2f45540c744a42129bf0a1254ed2c13d",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_7ec288558b8c43aba1abb9e5dd4612d3",
"IPY_MODEL_a59a755f6ccd48a0abdb35574713dbc9"
]
}
},
"2f45540c744a42129bf0a1254ed2c13d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7ec288558b8c43aba1abb9e5dd4612d3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_029cc249595640dabca1cda08dd8611b",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1000000,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1000000,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9b41f256d17c4bdcaebf6dd5f42ec5f9"
}
},
"a59a755f6ccd48a0abdb35574713dbc9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0baf1aa3115b453d9caa7160da9fc398",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1000000/1000000 [03:50<00:00, 4340.07it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5c2b2e9f6dfc4771a0f8671db975efd6"
}
},
"029cc249595640dabca1cda08dd8611b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"9b41f256d17c4bdcaebf6dd5f42ec5f9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0baf1aa3115b453d9caa7160da9fc398": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"5c2b2e9f6dfc4771a0f8671db975efd6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b0e9eaee892b4822985a6d0f20e41f07": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_c539d3b872284fb799b03132fac6dc14",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_c1c374868d4941929a85c2a00f2f2860",
"IPY_MODEL_a1089e2c170549539cb9844e6e4e5472"
]
}
},
"c539d3b872284fb799b03132fac6dc14": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c1c374868d4941929a85c2a00f2f2860": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_cac56c0a1b6c4d16ba479c3b7301961b",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1000000,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1000000,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_f75eb6fb8b5543eb812f5731d83009c9"
}
},
"a1089e2c170549539cb9844e6e4e5472": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0f0672563dd54404affe7ce0ef899ce1",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1000000/1000000 [03:41<00:00, 4514.84it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_8ef3b35ab56144c69768152099b623fe"
}
},
"cac56c0a1b6c4d16ba479c3b7301961b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"f75eb6fb8b5543eb812f5731d83009c9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0f0672563dd54404affe7ce0ef899ce1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"8ef3b35ab56144c69768152099b623fe": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b432b120350b42388447dcfcf959d673": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_5660d17e6c0e40b7bee6dc3d1af46f34",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_9dcf8c4bef5342aba837077c4904d852",
"IPY_MODEL_d9a80cc59ada42fe908726abd05942bf"
]
}
},
"5660d17e6c0e40b7bee6dc3d1af46f34": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9dcf8c4bef5342aba837077c4904d852": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_847d7a0c9f4a418e93484cd9bf8b8e0a",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 220100,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 220100,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6c74fe4ffc4f4013bf73d00212e5775a"
}
},
"d9a80cc59ada42fe908726abd05942bf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5a17271685624ad3bd2aa9c88504f969",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 220100/220100 [01:05<00:00, 3338.02it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_1780d72d89b5449eaa0dcea9c595b6d7"
}
},
"847d7a0c9f4a418e93484cd9bf8b8e0a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"6c74fe4ffc4f4013bf73d00212e5775a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5a17271685624ad3bd2aa9c88504f969": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"1780d72d89b5449eaa0dcea9c595b6d7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/avidale/44cd35bfcdaf8bedf51d97c468cc8001/create_rut5-base.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oh2xfITDhN2u"
},
"source": [
"The goal of this notebook is to create a Russian version of the mT5 model out of the multilingual one. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "BoiF06nfGvtW"
},
"source": [
"!pip install transformers sentencepiece"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zcVexj3Ye6X3"
},
"source": [
"# Removing the unused vocabulary"
]
},
{
"cell_type": "code",
"metadata": {
"id": "X99M7UWoHC9k"
},
"source": [
"from transformers import T5ForConditionalGeneration, T5Tokenizer\n",
"import torch"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201,
"referenced_widgets": [
"2d65b38f1ede49a0b4ae70b7e1f03359",
"aace2f95ba334bd48136c0caf5dca14c",
"745f0b9b29a0423d8759fb9f2c52cfad",
"0c1bd6c7a83a479db23009b96ef9caf4",
"ed1988f523184224bb1c8b6096ab31f1",
"7a16f7fd4b6d4b0e8eabfe1f126d79d1",
"32c54722ab15498da4adc548eccf603f",
"3a11705472864ea9a248b92c689f67e1",
"5a1a3f0010324df4b4dd42289eb258f4",
"983350c43d3e428b8573c8a7f6ed43ec",
"2467e4daed9945e9bad79d0bcec4efd2",
"1f8a7f5fe43b41db8fabc2ffad127423",
"a6590375d193488abdbb9fe6bb46f026",
"1a61b2d84b654b18bfafaf139401dabd",
"fa9c3c9b135a4440b5c0fcbce359b151",
"ad57c3a7bb5542398cc481744bf6fe58",
"5a3980a7d3434549aeb0276ff25dfc37",
"12935e9215b841dda2e40a1c3e497726",
"e342244246e24e858d5036b4eef040b8",
"50651c9c71f849569d70d7fe037a2c2f",
"81201911fdff4840b391ab5cbc6c2874",
"73e3647a17ef45bb8080ab67c2e70120",
"531ee8dbd2354128b842d82db5ec85ac",
"ae0027cb61cb4f41b3ee8ab127b3a7b7"
]
},
"id": "7OnBRq8pHFDN",
"outputId": "c078cc6d-01b2-47f8-aa2f-ad34b9eb4c1b"
},
"source": [
"tokenizer = T5Tokenizer.from_pretrained(\"google/mt5-base\")\n",
"tokenizer"
],
"execution_count": 4,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2d65b38f1ede49a0b4ae70b7e1f03359",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a1a3f0010324df4b4dd42289eb258f4",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a3980a7d3434549aeb0276ff25dfc37",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PreTrainedTokenizer(name_or_path='google/mt5-base', vocab_size=250100, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})"
]
},
"metadata": {
"tags": []
},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "HkXHkM6OHJcH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "e3a09b2d-82eb-4cfa-a6bb-0d37466ff821"
},
"source": [
"model = T5ForConditionalGeneration.from_pretrained('google/mt5-base')"
],
"execution_count": 37,
"outputs": [
{
"output_type": "stream",
"text": [
"You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YMItls1shI3-"
},
"source": [
"Our tokenizer contains 250K tokens. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "U0vhvaP8HKm8",
"outputId": "d48a92ad-0358-47bf-a5a0-583af98f08c6"
},
"source": [
"print(tokenizer.vocab_size)"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"250100\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hX8pzm4nhhMt"
},
"source": [
"The model has 582M parameters. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hz6Bv4tZIsX5",
"outputId": "e40c6a94-c9d2-4839-b1dc-b386056dc1a0"
},
"source": [
"def msize(m):\n",
" return sum(p.numel() for p in m.parameters())\n",
"\n",
"original_size = msize(model)\n",
"print(msize(model))\n",
"print(msize(model.shared))\n",
"print('encoder')\n",
"print(msize(model.encoder))\n",
"print(msize(model.encoder.block))\n",
"print('decoder')\n",
"print(msize(model.decoder))\n",
"print(msize(model.decoder.block))\n",
"print(msize(model.lm_head))"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"582401280\n",
"192086016\n",
"encoder\n",
"277040256\n",
"84953472\n",
"decoder\n",
"305361024\n",
"113274240\n",
"192086016\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "18ckhebWLLra"
},
"source": [
"Input and output embeddings are 66% of the whole model"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hmvmyYsyHh2s",
"outputId": "aa72c8fb-e202-4460-efe9-fd28a821f1e2"
},
"source": [
"print(msize(model.shared) / msize(model))\n",
"print(msize(model.lm_head) / msize(model))"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"0.32981729710484153\n",
"0.32981729710484153\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "amFXHV9OL9SU"
},
"source": [
"# Determine the new tokens"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NfeGCTv5Vvmu"
},
"source": [
"Take a file from https://wortschatz.uni-leipzig.de/en/download/Russian as a representation of the Russian language. It contains 1M sentences. \n",
"\n",
"Also take a similar representation of English, because we want our model to be bilingual, and English shares few tokens with Russian."
]
},
{
"cell_type": "code",
"metadata": {
"id": "WxsNhpKfME5W"
},
"source": [
"!wget http://pcai056.informatik.uni-leipzig.de/downloads/corpora/rus-ru_web-public_2019_1M.tar.gz\n",
"!tar -xsvf rus-ru_web-public_2019_1M.tar.gz"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XNHwPMCHiRhr"
},
"source": [
"!wget http://pcai056.informatik.uni-leipzig.de/downloads/corpora/eng-com_web-public_2018_1M.tar.gz\n",
"!tar -xsvf eng-com_web-public_2018_1M.tar.gz"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "gqjTHFJIiZTk"
},
"source": [
"Let us look at the sentences"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "IoJlXMw_M7pT",
"outputId": "5d1ca6d0-4153-4290-db8e-5e0ec29e6f80"
},
"source": [
"import pandas as pd\n",
"pd.options.display.max_colwidth = 300\n",
"import csv\n",
"fname = 'rus-ru_web-public_2019_1M/rus-ru_web-public_2019_1M-sentences.txt'\n",
"df_ru = pd.read_csv(fname, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"df_ru.columns = ['idx', 'text']\n",
"df_ru.sample(5)"
],
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>idx</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53482</th>\n",
" <td>53483</td>\n",
" <td>Больше Лена ничего говорить не стала, не до этого было.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>937136</th>\n",
" <td>937137</td>\n",
" <td>Чиновники наши не беднеют, а при наших доходах не разбогатеешь точно».</td>\n",
" </tr>\n",
" <tr>\n",
" <th>401463</th>\n",
" <td>401464</td>\n",
" <td>Кроме обязательной почты, сберкассы и трех магазинов РайПО, здесь функционируют объекты социальной инфраструктуры.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238656</th>\n",
" <td>238657</td>\n",
" <td>Доставка по России и ближнему зарубежью.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>295958</th>\n",
" <td>295959</td>\n",
" <td>Здесь раскинулась долина, в центре которой течет поток зеленовато‑голубого цвета шириной в несколько десятков метров, светящийся в темноте, как большой освещаемый изнутри бассейн.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" idx text\n",
"53482 53483 Больше Лена ничего говорить не стала, не до этого было.\n",
"937136 937137 Чиновники наши не беднеют, а при наших доходах не разбогатеешь точно».\n",
"401463 401464 Кроме обязательной почты, сберкассы и трех магазинов РайПО, здесь функционируют объекты социальной инфраструктуры.\n",
"238656 238657 Доставка по России и ближнему зарубежью.\n",
"295958 295959 Здесь раскинулась долина, в центре которой течет поток зеленовато‑голубого цвета шириной в несколько десятков метров, светящийся в темноте, как большой освещаемый изнутри бассейн."
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "V-Uc7nbziyXp",
"outputId": "c14d04a8-5b09-4977-aa10-9ed34a2ebfba"
},
"source": [
"fname = 'eng-com_web-public_2018_1M/eng-com_web-public_2018_1M-sentences.txt'\n",
"df_en = pd.read_csv(fname, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"df_en.columns = ['idx', 'text']\n",
"df_en.sample(5)"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>idx</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>536627</th>\n",
" <td>536628</td>\n",
" <td>My two crabby old men cats were not so impressed, but Miss Agnes DeMitten (aka Endora) was checking behind the monitor.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>783178</th>\n",
" <td>783179</td>\n",
" <td>There is another lightweight distribution, in the Ubuntu family called Xubuntu, but Lubuntu is far more efficient when it comes to memory usage.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>447801</th>\n",
" <td>447802</td>\n",
" <td>It's perfectly symmetric -- client to server, WinFS to Sharepoint.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>207171</th>\n",
" <td>207172</td>\n",
" <td>\"Everything we asked for we got from Judge Lasnik,\" he said, and called on President Trump to make it \"unlawful for anyone to make this information available for anyone\".</td>\n",
" </tr>\n",
" <tr>\n",
" <th>595030</th>\n",
" <td>595031</td>\n",
" <td>\"People who are willing to do something like this, especially if the shark hasn't made it, are brought to justice,\" Spellman said.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" idx text\n",
"536627 536628 My two crabby old men cats were not so impressed, but Miss Agnes DeMitten (aka Endora) was checking behind the monitor.\n",
"783178 783179 There is another lightweight distribution, in the Ubuntu family called Xubuntu, but Lubuntu is far more efficient when it comes to memory usage.\n",
"447801 447802 It's perfectly symmetric -- client to server, WinFS to Sharepoint.\n",
"207171 207172 \"Everything we asked for we got from Judge Lasnik,\" he said, and called on President Trump to make it \"unlawful for anyone to make this information available for anyone\".\n",
"595030 595031 \"People who are willing to do something like this, especially if the shark hasn't made it, are brought to justice,\" Spellman said."
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zhkWqfdNjNww"
},
"source": [
"Count the tokens that the current model uses for representing the sentences. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 115,
"referenced_widgets": [
"7bf6adb3c017459f85a2399ede31edb2",
"1fb43db4b3d74b25b2e0e7f3daa5f4c1",
"56ffd44ce9dd4cfb902b3e5d785985e5",
"3146e7ba8c0d44aa8c07f31006b8dde5",
"737f1d75176f403d95d7e0ea18933d51",
"faebe32278c04527abd995a4da588c13",
"a630c1d2b88d4f9dbda38020f7eaa287",
"9181d9829eb24887b18f5e16364b6584",
"016ce230728a4da28a8992a571807576",
"2f45540c744a42129bf0a1254ed2c13d",
"7ec288558b8c43aba1abb9e5dd4612d3",
"a59a755f6ccd48a0abdb35574713dbc9",
"029cc249595640dabca1cda08dd8611b",
"9b41f256d17c4bdcaebf6dd5f42ec5f9",
"0baf1aa3115b453d9caa7160da9fc398",
"5c2b2e9f6dfc4771a0f8671db975efd6"
]
},
"id": "lmzSON9iM_yb",
"outputId": "37d26a05-0566-444d-b6da-37506f197ea7"
},
"source": [
"from collections import Counter\n",
"from tqdm.auto import tqdm, trange\n",
"\n",
"cnt_ru = Counter()\n",
"for text in tqdm(df_ru.text):\n",
" cnt_ru.update(tokenizer.encode(text))\n",
"\n",
"cnt_en = Counter()\n",
"for text in tqdm(df_en.text):\n",
" cnt_en.update(tokenizer.encode(text))"
],
"execution_count": 20,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7bf6adb3c017459f85a2399ede31edb2",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "016ce230728a4da28a8992a571807576",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 66,
"referenced_widgets": [
"b0e9eaee892b4822985a6d0f20e41f07",
"c539d3b872284fb799b03132fac6dc14",
"c1c374868d4941929a85c2a00f2f2860",
"a1089e2c170549539cb9844e6e4e5472",
"cac56c0a1b6c4d16ba479c3b7301961b",
"f75eb6fb8b5543eb812f5731d83009c9",
"0f0672563dd54404affe7ce0ef899ce1",
"8ef3b35ab56144c69768152099b623fe"
]
},
"id": "l8UdvvAYlJ6_",
"outputId": "b3dd44a3-12c2-4b5d-d247-9e8d341d766d"
},
"source": [
"cnt_en = Counter()\n",
"for text in tqdm(df_en.text):\n",
" cnt_en.update(tokenizer.encode(text))"
],
"execution_count": 23,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b0e9eaee892b4822985a6d0f20e41f07",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sTzND5F1OkEY"
},
"source": [
"The tokens that are ever used with Russian are 23% of the whole vocabulary. With English, it is 27%.\n",
"\n",
"Surprisingly, there is more than 50% overlap between the vocabularies. Perhaps this is because Russian texts occasionally contain English words or other words written in the Latin alphabet. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "M07fj3z0NWiy",
"outputId": "b335c276-1ab1-4465-cbb8-8a9fa5d08cff"
},
"source": [
"print(len(cnt_ru), len(cnt_ru)/tokenizer.vocab_size)\n",
"print(len(cnt_en), len(cnt_en)/tokenizer.vocab_size)\n",
"common = len(set(cnt_ru.keys()).intersection(set(cnt_en.keys())))\n",
"print(common, common / len(cnt_ru))"
],
"execution_count": 58,
"outputs": [
{
"output_type": "stream",
"text": [
"58438 0.23365853658536587\n",
"67920 0.2715713714514194\n",
"33211 0.5683117149799788\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2ULUmyllmNA0"
},
"source": [
"For both English and Russian, the 10K most frequent tokens cover about 95% of all token occurrences in the corpus, and the top 20K cover about 98-99%. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kNudkAe5NbKT",
"outputId": "de2363c8-acec-4048-fc6a-5041e675ab13"
},
"source": [
"print('ru')\n",
"for top in 10_000, 20_000, 30_000:\n",
" print(top, sum(v for k, v in cnt_ru.most_common(top)) / sum(cnt_ru.values()))\n",
"print('en')\n",
"for top in 10_000, 20_000, 30_000:\n",
" print(top, sum(v for k, v in cnt_en.most_common(top)) / sum(cnt_en.values()))"
],
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"text": [
"ru\n",
"10000 0.9645064095240437\n",
"20000 0.9948845835370821\n",
"30000 0.9982199641222749\n",
"en\n",
"10000 0.9531899764307693\n",
"20000 0.9840809828270257\n",
"30000 0.9937869259525808\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0N_D37J3lbqr"
},
"source": [
"Remember the old vocabulary, because we are going to replace it soon!"
]
},
{
"cell_type": "code",
"metadata": {
"id": "9RzGibfZQbgP"
},
"source": [
"old_voc = tokenizer.get_vocab()\n",
"old_inv_voc = {v: k for k, v in old_voc.items()}"
],
"execution_count": 27,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "rKwEQtbRljiC"
},
"source": [
"Look at the most used tokens. They are mostly function words or prefixes."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y8oL4rL8QZ8f",
"outputId": "a8c56a5f-d5ba-4da1-80fe-4038d86f9efe"
},
"source": [
"print(tokenizer.convert_ids_to_tokens([k for k, v in cnt_ru.most_common(30)]))\n",
"print(tokenizer.convert_ids_to_tokens([k for k, v in cnt_en.most_common(30)]))"
],
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"text": [
"['▁', ',', '</s>', '.', 'и', '▁в', 'а', 'е', '▁не', '▁на', '▁с', 'я', '-', 'ы', '▁по', '▁что', 'у', 'о', 'ом', 'ов', 'ой', '▁за', '▁от', '▁это', '▁В', 'й', '▁у', '▁как', 'ть', '▁«']\n",
"['▁', '</s>', '.', '▁the', ',', 's', '▁to', '▁and', 'a', '▁of', '▁in', '▁is', '▁I', '’', '▁that', 'ed', '▁for', '-', 'ing', \"'\", '▁you', '▁it', '▁with', '▁on', 'ly', 'y', '▁be', '▁The', '▁as', '▁are']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AwwPWiO3Po1x"
},
"source": [
"We try the following composition of vocabulary:\n",
"* 1K of top tokens of the original tokenizer (just in case)\n",
"* Top 10K of the English vocabulary\n",
"* Top 20K of the Russian vocabulary (or more, to make the total number of tokens 30K)\n",
"* 100 special tokens that T5 uses\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "J-aSMIB1Pxvh",
"outputId": "ee53265d-6192-40df-e98c-67b3ca11b285"
},
"source": [
"new_tokens = set(range(1000))\n",
"for i, (k, v) in enumerate(cnt_en.most_common(10_000)):\n",
" if k not in new_tokens:\n",
" new_tokens.add(k)\n",
"for i, (k, v) in enumerate(cnt_ru.most_common(25_000)):\n",
" if len(new_tokens) == 29_900:\n",
" print(i, 'Russan tokens are included')\n",
" break\n",
" if k not in new_tokens:\n",
" new_tokens.add(k)\n",
"\n",
"for t in range(tokenizer.vocab_size - 100, tokenizer.vocab_size):\n",
" new_tokens.add(t)\n",
"\n",
"print(len(new_tokens))\n",
"kept_ids = sorted(new_tokens)"
],
"execution_count": 39,
"outputs": [
{
"output_type": "stream",
"text": [
"20843 Russan tokens are included\n",
"30000\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BLAFLhrDoD4U"
},
"source": [
"The new vocabulary is only 12% of the original one. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "q21bC7tpTyuW",
"outputId": "bed03ca4-c652-4d9b-cb4e-d2caceecc51e"
},
"source": [
"len(kept_ids) / tokenizer.vocab_size"
],
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.11995201919232307"
]
},
"metadata": {
"tags": []
},
"execution_count": 40
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "s9ZrtTdcRfN_"
},
"source": [
"The plot shows that the tokens that were more frequent in the original vocabulary more frequently get into the new vocabulary (so that the curve bends upward). "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 279
},
"id": "IAPmeDZmRDIf",
"outputId": "c5b4526f-cbd5-447c-f372-6869ce6d5324"
},
"source": [
"import matplotlib.pyplot as plt\n",
"plt.plot(kept_ids)\n",
"plt.xlabel('new id of token')\n",
"plt.ylabel('old id of token');"
],
"execution_count": 42,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAEGCAYAAABYV4NmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd3yV5d3H8c+PQNh7E8Age1QQw3LUVRXUinWPR1FR63zsVmur1rbPY2urrQNbBwoOcPKAVkVUcLLC3hI2EQgkEHYgye/541zRUxpCgJycnOT7fr3O69znutfvygn5cV/XdV+3uTsiIiJlrVq8AxARkcpJCUZERGJCCUZERGJCCUZERGJCCUZERGKierwDqCiaNWvmqamp8Q5DRCShzJo1a4u7Ny9unRJMkJqaSnp6erzDEBFJKGa25mDr1EQmIiIxoQQjIiIxoQQjIiIxoQQjIiIxoQQjIiIxEbMEY2btzGyymS02s0Vmdlcof9DMMs1sbnidG7XPvWaWYWbLzOycqPLBoSzDzO6JKu9gZtND+WtmlhzKa4bPGWF9aqzqKSIixYvlFUw+8HN37wEMBG43sx5h3WPu3ie83gMI664AegKDgRFmlmRmScBTwBCgB3Bl1HH+FI7VCdgKDA/lw4GtofyxsJ2IiJSjmCUYd9/g7rPD8g5gCZBSwi5DgbHunufuq4AMoH94Zbj7SnffB4wFhpqZAWcAb4b9RwEXRh1rVFh+EzgzbC8iIsG4Oet59MNlrMneFZPjl0sfTGiiOh6YHoruMLP5ZjbSzBqHshRgXdRu60PZwcqbAtvcPf+A8n87VlifG7Y/MK6bzSzdzNI3b958VHUUEUkkG3L38NPX5vH4Jxms3JKgCcbM6gFvAT9x9+3A00BHoA+wAfhrrGM4GHd/xt3T3D2tefNiZzoQEamU8vYXAvCXS3tzetcWMTlHTBOMmdUgklxecfe3Adx9k7sXuHsh8CyRJjCATKBd1O5tQ9nByrOBRmZW/YDyfztWWN8wbC8iIkBBeJpxjaTY9R7EchSZAc8DS9z90ajy1lGb/QhYGJYnAFeEEWAdgM7ADGAm0DmMGEsmMhBggkee9TwZuCTsPwwYH3WsYWH5EuAT17OhRUS+VVgY+ZOYVC12CSaWk12eBFwDLDCzuaHs10RGgfUBHFgN/BjA3ReZ2evAYiIj0G539wIAM7sDmAgkASPdfVE43t3AWDP7AzCHSEIjvL9kZhlADpGkJCIiQX5Rgonh+KeYJRh3/wIoLvL3Stjnj8Afiyl/r7j93H0l3zWxRZfvBS49nHhFRKqSgpBgqsXwCkZ38ouIVEGFHvsrGCUYEZEq6NsmskTs5BcRkYqroBz6YJRgRESqoC8ztgDQplGtmJ1DCUZEpIpZsXknI6as4LzvtaZTi/oxO48SjIhIFeLuPPTOYmpWr8YDF/Q49A5HQQlGRKQKeXf+Bj79ejN3ndmZFvVj1zwGSjAiIlVG1va9/Hb8Qnq3bch1J6bG/HxKMCIiVYC7c+/bC9izr4C/XtaH6kmx//OvBCMiUgVMXLSJj5dm8avB3ejUol65nFMJRkSkkttfUMgjE5dybLO65dI0VkQJRkSkkvvDu4tZsXkX953XPaazJx9ICUZEpBJ7PX0do6auYfjJHTize8tyPbcSjIhIJTVrTQ6/GbeQkzs149fndi/38yvBiIhUQrl79nPHq3No1bAWT151fLk2jRWJ5QPHREQkTv78wVKyduQx7rYTaVQnOS4x6ApGRKSS+WL5FsbMWMs1A4/huLaN4haHEoyISCUybWU2N4yaybHN6/GLc7rGNRYlGBGRSiJ7Zx53jZ1D20a1ee3mgdSrGd9eEPXBiIhUAgWFzs/fmMfWXfsZeV0/mtarGe+QdAUjIlIZPDppGVOWbea3P+xBzzYN4x0OoAQjIpLwJi/LYsSUFVye1o5rBh4T73C+pQQjIpLA0lfncO
vLs+jWqkHMHyB2uJRgREQS1MLMXK5/cSZtGtbmpeH9qZNcsbrVlWBERBLQjr37ue2V2dSvWZ2XbhxAswrQqX+gipXuRETkkAoLnV+8MY/MbXt47eaBpDSqHe+QiqUrGBGRBPPcFyuZuGgT9w7pRlpqk3iHc1BKMCIiCWTm6hwembiMc3q2ZPjJHeIdTomUYEREEsTmHXnc/sps2jauw58uPg6z8p8h+XAowYiIJICCQueusXPI3bOfEVf3jdsMyYdDnfwiIhWcu/PQO4v4akU2f77kOLq3bhDvkEolZlcwZtbOzCab2WIzW2Rmd4XyJmY2ycyWh/fGodzM7HEzyzCz+WbWN+pYw8L2y81sWFT5CWa2IOzzuIXrxYOdQ0QkET3/xSpGTV3DTad04LK0dvEOp9Ri2USWD/zc3XsAA4HbzawHcA/wsbt3Bj4OnwGGAJ3D62bgaYgkC+ABYADQH3ggKmE8DdwUtd/gUH6wc4iIJJRnP1vJH/61hME9W3HPkPJ/7PHRiFmCcfcN7j47LO8AlgApwFBgVNhsFHBhWB4KjPaIaUAjM2sNnANMcvccd98KTAIGh3UN3H2auzsw+oBjFXcOEZGEMXrqav743hLO+15rnojTY4+PRrl08ptZKnA8MB1o6e4bwqqNQMuwnAKsi9ptfSgrqXx9MeWUcI4D47rZzNLNLH3z5s2HXzERkRh5adoa7h+/iB90b8Fjl/ehRlLijcmKecRmVg94C/iJu2+PXheuPDyW5y/pHO7+jLunuXta8+bNYxmGiEipzViVw+/fWczpXZvz9H+dQHL1xEsuEOMEY2Y1iCSXV9z97VC8KTRvEd6zQnkmEN171TaUlVTetpjyks4hIlKhrcnexU2j02nbuDZ/vSwxr1yKxHIUmQHPA0vc/dGoVROAopFgw4DxUeXXhtFkA4Hc0Mw1ETjbzBqHzv2zgYlh3XYzGxjOde0BxyruHCIiFdbOvHxuGp2OGbxwfT+a1K3497qUJJb3wZwEXAMsMLO5oezXwMPA62Y2HFgDXBbWvQecC2QAu4HrAdw9x8x+D8wM2z3k7jlh+TbgRaA28H54UcI5REQqpH35hdw1Zg4rNu9i9A39OaZp3XiHdNQs0kUhaWlpnp6eHu8wRKQKcnd+8tpcxs/9ht8P7ck1g1LjHVKpmdksd08rbl3iNu6JiFQST3+6gvFzv+GuMzsnVHI5FCUYEZE4+mDhRv78wTIu6N2G/z6zc7zDKVNKMCIicZK+OoefvjaX3u0a8edLjku4GykPRQlGRCQO5q/fxvUvzKR1w1o8e+0J1KqRFO+QypwSjIhIOVuyYTvXPD+DRnVr8MpNA2hRv1a8Q4oJJRgRkXKUkbWT/3puOnWSk3j1xoG0blg73iHFjBKMiEg5WZO9i6ufm4aZ8cqNA2jXpE68Q4opJRgRkXKQuW0Pl/9zGvvyC3nlxgEc27xevEOKOT3RUkQkxtZv3c31L8xkZ14+r/94EF1b1Y93SOVCCUZEJIZWbt7J5c9MI29/Ac8NS6NHm8R43HFZUIIREYmRhZm5XDtyBgBv3noiXVpWjSuXIuqDERGJgRmrcrjymWnUrpHEW1UwuYCuYEREytyUZVnc8vIs2jSqzUvDB5DSqPIORS6JEoyISBl6PX0d941bQJeW9Rl9Q3+a1qsZ75DiRglGRKSMPDJxKU9NXsEpnZvx1NV9aVCrRrxDiislGBGRo1RQ6Dz0ziJGTV3DFf3a8dDQXiRXVxe3EoyIyFHYs6+An70+l/cXbuSmUzpwz5DulW5W5COlBCMicoQ2bd/LLS/PYu66bfzmvO7ceMqx8Q6pQlGCERE5Ais27+Ta52eQs2sfI67qy5DvtY53SBWOEoyIyGGau24bw1+cCcAbtwyiV0rDOEdUMZUqwZjZiUBq9PbuPjpGMYmIVFiz1mzlv56bTrP6yYy6vn+VmLTySB0ywZjZS0BHYC5QEIodUIIRkSrl0683c9vLs2jZoCav3zKo0j
4orKyU5gomDejh7h7rYEREKqoPFm7kzjGz6dSiPi9c10/JpRRKM1B7IdAq1oGIiFRU4+as5/ZXZ9MrpSFjbx5Iq4ZKLqVRmiuYZsBiM5sB5BUVuvsFMYtKRKSCePHLVTz4zmIGHduUZ4elUa+mxkaVVml+Ug/GOggRkYrG3RkxZQWPTFzG2T1a8sRVx1OzelK8w0ooh0ww7v6pmR0DdHb3j8ysDqCfsohUWu7O/7y3hGc/X8XQPm3466W9qZ6kqV8O1yF/YmZ2E/Am8M9QlAL8XyyDEhGJl4JC5zf/t5BnP1/FsEHH8NhlfZRcjlBpmshuB/oD0wHcfbmZtYhpVCIicZCXX8Cdr87hw8WbuPW0jvzqnK6YaV6xI1WaBJPn7vuKfshmVp3IfTAiIpXG3v0F3PHqbD5aksUDP+zB9Sd1iHdICa80132fmtmvgdpmdhbwBvDOoXYys5FmlmVmC6PKHjSzTDObG17nRq2718wyzGyZmZ0TVT44lGWY2T1R5R3MbHoof83MkkN5zfA5I6xPLc0PQkSqrtzd+xk+aiYfLcniQSWXMlOaBHMPsBlYAPwYeM/d7yvFfi8Cg4spf8zd+4TXewBm1gO4AugZ9hlhZklmlgQ8BQwBegBXhm0B/hSO1QnYCgwP5cOBraH8sbCdiEix1m/dzcX/+IoZq3L4y6W9uU7JpcyUJsE86O7Puvul7n4JMNLMXjnUTu7+GZBTyjiGAmPdPc/dVwEZRPp9+gMZ7r7S3fcBY4GhFmmvO4PI4AOAUcCFUccaFZbfBM40NaKKSDEWZubyoxFfsWn7XkbfMIBLTmgb75AqldIkmHZmdi9AaIZ6C1h+FOe8w8zmhya0xqEsBVgXtc36UHaw8qbANnfPP6D8344V1ueG7f+Dmd1sZulmlr558+ajqJKIJJrJy7K47J9TSU6qxlu3nsigjsX+mZCjUJoEcwPwvZBk3gU+dfcHj/B8TxOZOLMPsAH46xEep0y4+zPunubuac2bN49nKCJSjsbOWMuNo9Lp0Kwub992Il1a1o93SJXSQUeRmVnfqI9/J3IfzJdEOv37uvvswz2Zu2+KOv6zRBIWQCbQLmrTtqGMg5RnA43MrHq4SonevuhY68OIt4ZhexGp4tydRyd9zROfZHBql+Y8dXVfTf0SQyX9ZA+8uthKpKP9r0SGKZ9xuCczs9buviF8/BGRiTQBJgCvmtmjQBugMzADMKCzmXUgkjiuAK5ydzezycAlRPplhgHjo441DJga1n+imaBFZM++Au5+az4T5n3DFf3a8fsLe1FDN1DG1EETjLuffjQHNrMxwGlAMzNbDzwAnGZmfYgkqNVERqXh7ovM7HVgMZAP3O7uBeE4dwATiUxPM9LdF4VT3A2MNbM/AHOA50P588BLZpZBZJDBFUdTDxFJfLl79nPjqJmkr9nKL8/pym2nddQNlOXADvWfezNrSCQ5fD8UfQo85O65MY6tXKWlpXl6enq8wxCRMrYuZzfXvTCDtTm7eezyPpx/XJt4h1SpmNksd08rbl1prg9HAjuAy8JrO/BC2YUnIhIbCzNzuejpr9i8I4/RNwxQcilnpend6ujuF0d9/p2ZzY1VQCIiZWH++m0MGzmD5OqRYcidNVKs3JXmCmaPmZ1c9MHMTgL2xC4kEZGjM2VZFlc8M406ydUZe/MgJZc4Kc0VzC3A6NAXA5HRZMNiF5KIyJHJLyjk8Y+X8+TkDLq2asCo6/vRooEebxwvpUkw2929t5k1AHD37WHYsIhIhbFnXwG3vzqbT5ZmcVHfFP5wYS/qJOsel3gqzU//LaCvu2+PKnsTOCE2IYmIHJ5N2/dyy8uzmLduG3/8US+uHnBMvEMSSr6TvxuR2Y0bmtlFUasaALrmFJEKYcaqHG57ZRa78goYcXVfBvdqHe+QJCjpCqYrcD7QCPhhVPkO4KZYBiUiUhpjZqzl/vELadu4Dq/eNFBzilUwJd3JPx4Yb2aD3H1qOcYkIlKi3f
vy+c24hbw9J5NTuzTn8SuPp2HtGvEOSw5wyD4YJRcRqUi27Mxj+IszmZ+Zy11ndubOMzpRXXOKVUgaYiEiCWPqimx+8cY8snfl8ew1afygR8t4hyQlOGjaN7O7wvtJ5ReOiMh/Kih0/vbR11z13DSSq1fjtZsHKbkkgJKuYK4n8hyYJ4C+JWwnIhIz63J28/M35jFjVQ4XHZ/CH36k+1sSRUnf0hIzWw60MbP5UeUGuLsfF9vQRKSq+3DRRn7xxjwKHf5yaW8u7puiafYTSEmjyK40s1ZEnsVyQfmFJCJV3d79Bfzve0sYNXUNvVIaMOKqE2jftE68w5LDVOJ1prtvBHqbWTLQJRQvc/f9MY9MRKqkFZt3cserc1iyYTvDT+7ArwZ3pWb1pHiHJUfgkA2ZZnYqMJrIEygNaGdmw9z9sxjHJiJViLvz6oy1PPTOYuokJ/H8sDTO7K6O/ERWmp6yR4Gz3X0ZgJl1AcaguchEpIzk7t7PL96cx6TFmzilczP+cmlvWmoW5IRXmgRToyi5ALj712amW2ZFpEwsWJ/Lba/OYsO2vfzmvO7ccFIHqlVTR35lUJoEk25mzwEvh89XA3p4vYgcFXfn5Wlr+P27S2hWL5nXbxlE3/aN4x2WlKHSJJhbgduB/w6fPwdGxCwiEan0dublc+/bC3hn3jec1rU5j13Wh8Z1k+MdlpSx0sxFlkekH+bR2IcjIpXdso07uPWVWazesotfntOVW0/tqCaxSkq3w4pIuXB33py1nt+OX0i9mjV4+cYBnNixWbzDkhhSghGRmFu+aQcPvrOILzOyGXhsEx6/8nha1NcoscpOCUZEYmb3vnz+/vFynvt8FXWTk/jdBT25ekB7Ta9fRZT0yOR3AD/YenfX9DEiclDTVmbzyzfnsS5nD5elteXuwd1oWq9mvMOSclTSFcxfwvtFQCu+G6Z8JbAplkGJSOLas6+AP09cygtfrqZ9kzqMuWkggzo2jXdYEgclTXb5KYCZ/dXd06JWvWNmug9GRP7DzNU5/PKNeazO3s2wQcdw95Bumlq/CivNN1/XzI5195UAZtYBqBvbsEQkkWzfu5+/f7ScF75cRZtGtXXVIkDpEsxPgSlmtpLIZJfHAD+OaVQikhDcnXfmb+ChdxaTvSuPK/u359fndqdeTV21SOlutPzAzDoD3ULR0nDzpYhUYWuzd/Pb8Qv59OvNHNe2ISOvS+O4to3iHZZUIAcdK2hmZ4T3i4DzgI7hdV4oK5GZjTSzLDNbGFXWxMwmmdny8N44lJuZPW5mGWY238z6Ru0zLGy/3MyGRZWfYGYLwj6PW3jM3cHOISJlo6DQee7zlZzzt8+YtWYrvz2/B2/feqKSi/yHkgajnxref1jM6/xSHPtFYPABZfcAH7t7Z+Dj8BlgCNA5vG4GnoZIsgAeAAYA/YEHohLG08BNUfsNPsQ5ROQoLdu4g4uf/oo//GsJgzo25cOffp/hJ3fQfS1SrJJGkT0Q3q8/kgO7+2dmlnpA8VDgtLA8CpgC3B3KR7u7A9PMrJGZtQ7bTnL3HAAzmwQMNrMpQAN3nxbKRwMXAu+XcA4ROUJ5+QWMmLyCEVMyqF+rBn+/og8X9G5DaDgQKVZ598S1dPcNYXkjUPS4uhRgXdR260NZSeXriykv6Rz/wcxuJnLFRPv27Q+3LiJVwvSV2TwwYRFLN+5gaJ823H9+D90wKaUSt6Ee7u5mdtCZAsrjHO7+DPAMQFpaWkxjEUk0Wdv38vAHS3l7diatG9bSI4zlsJV3gtlkZq3dfUNoAssK5ZlAu6jt2oayTL5r7ioqnxLK2xazfUnnEJFS+mjxJn711nx27s3n9tM7cucZnalVIyneYUmCKWkushJHirn720dwvgnAMODh8D4+qvwOMxtLpEM/NySIicD/RHXsnw3c6+45ZrbdzAYC04FrgScOcQ4ROYTte/fzx3eX8Fr6Orq1qs/rPx5Epxb14h2WJKiSrmB+GN5bACcCn4
TPpwNfASUmGDMbQ+Tqo5mZrScyGuxh4HUzGw6sAS4Lm78HnAtkALuB6wFCIvk9MDNs91BRhz9wG5GRarWJdO6/H8oPdg4ROQh35/2FG7l//CKyd+Vxy6kd+dlZXUiurtFhcuQsMnCrhA3MPgSGFXWch2anF939nHKIr9ykpaV5erqmWJOqJ3f3fh6YsJD/m/sNPds04OGLjuN7bRvGOyxJEGY264D5Kr9Vmj6YdlGjsiAyk7KGXIlUAu/O/4YHxi9i25793HVmZ+48o5PuaZEyU5oE83HoCxkTPl8OfBS7kEQk1tbl7OZ37yzmoyWb6N22IaNu6E+vFF21SNkqzVxkd4QO/1NC0TPuPi62YYlILOzLL+TZz1fy5CcZVDO4Z0g3btSd+BIjpRqmHEaMHcmoMRGpANydSYs38fAHS1m5eRdn92jJgxf0pE2j2vEOTSqxkoYp76D4RyYbkXsYG8QsKhEpM19v2sGDExbx1YpsOjavqxsmpdyUNBdZ/fIMRETK1o69+3nykwxGfrmKujWr89DQnlzZvz011Bwm5aRUTWRm1pvv+mA+c/f5sQtJRI7Wx0s28etxC8jakcfFfdty75Bumj9Myt0hE4yZ3UVkWvyiPphXzOwZd3+ihN1EJA5WbN7JH/+1hE+WZtG1ZX3+eU0afdrpOS0SH6W5ghkODHD3XQBm9idgKt9NzSIicZa7ez9/+/hrXpq6hto1kvj1ud0YdmIqNatr/jCJn9IkGAMKoj4XhDIRqQAmLd7Er96cR+6e/VzRvz0/O6sLzdQcJhVAaRLMC8B0Myu69+VC4PnYhSQipbErL59HJi7jxa9W0yulAa/eNJDurTW4UyqO0txo+Wh4guTJoeh6d58T06hE5KAKC51xczL5y4fL2JC7l2sHHcN953VXc5hUOKW90XI2MDvGsYjIIXy1Ygt//NcSFn2znePaNuSJK48nLbVJvMMSKVbcnmgpIqWXkbWTh99fwkdLskhpVJu/X9GHHx7XhmrV1B0qFZcSjEgFlr0zj799tJxXZ6ylTo0k7h7cjetPStXTJSUhKMGIVEB79xcw8stVjJi8gj37C7iqf3t+8oPOullSEooSjEgF4u6Mn/sNj0xcRua2PfygewvuGdJdjy2WhKQEI1IBuDuffr2Z/31vKcs27aBXSgMeufQ4TuzYLN6hiRwxJRiROFuYmct94xYwb30u7ZvU4bHLezO0d4o68CXhKcGIxEnmtj08/P5S3p3/Dc3q1eRPF3+PoX1S1IEvlYYSjEg527OvgGc/X8k/Pl1BoTu3nNqRW77fkYZ1asQ7NJEypQQjUk5y9+xn1FerGT11DVt25jG4ZyvuO6877ZrUiXdoIjGhBCMSY/sLChn11WqenJzBtt37Ob1rc249rRP9O+gOfKnclGBEYsTdmbR4Ew+9u5j1W/dwSudm3D24G71SGsY7NJFyoQQjEgPz1m3jkYnL+CJjC11b1mfkdWmc3rUFZhoZJlWHEoxIGXF3Plu+hRGTM5i+KofGdWrw2/N7cO2gY6iRVC3e4YmUOyUYkTKwfNMO7h+/iKkrs0lpVJt7h3TjqgHtqV9LI8Ok6lKCETkKmdv28PhHy3lj1jrq16rB7y7oyZX925NcXVcsIkowIkcgc9seRkzO4PX0dQBcOyiVO8/opMkoRaIowYgchtVbdvHPz1by5qxIYrksrR23nd6JlEa14xyZSMUTlwRjZquBHUABkO/uaWbWBHgNSAVWA5e5+1aLDLv5O3AusBu4LjxhEzMbBvwmHPYP7j4qlJ8AvAjUBt4D7nJ3L5fKSaWUvTOPxz76mrEz1lGtmimxiJRCPK9gTnf3LVGf7wE+dveHzeye8PluYAjQObwGAE8DA0JCegBIAxyYZWYT3H1r2OYmYDqRBDMYeL98qiWVyd79BTz/xSqenvLdc1nuPLMTLerXindoIhVeRWoiGwqcFpZHAVOIJJihwOhwBTLNzBqZWeuw7SR3zwEws0nAYDObAj
Rw92mhfDRwIUowchj27Cvglelr+OdnK9m8I4+zerTk7sHd9FwWkcMQrwTjwIdm5sA/3f0ZoKW7bwjrNwItw3IKsC5q3/WhrKTy9cWU/wczuxm4GaB9+/ZHUx+pJPbuL+DlaWt4esoKsnft48SOTXnyyuMZcGzTeIcmknDilWBOdvdMM2sBTDKzpdEr3d1D8ompkNieAUhLS1MfTRW2L7+Q19PX8eQnGWzcvpeTOjXlJz/oQr9UzRcmcqTikmDcPTO8Z5nZOKA/sMnMWrv7htAElhU2zwTaRe3eNpRl8l2TWlH5lFDetpjtRf5DXn4BE+Z+w1OTM1idvZt+qY157PI+DOqoKxaRo1XuCcbM6gLV3H1HWD4beAiYAAwDHg7v48MuE4A7zGwskU7+3JCEJgL/Y2aNw3ZnA/e6e46ZbTezgUQ6+a8Fniiv+kli2JC7h7Ez1vHK9LVs2ZlHl5b1eOH6fpzWpbnmCxMpI/G4gmkJjAv/iKsDr7r7B2Y2E3jdzIYDa4DLwvbvERminEFkmPL1ACGR/B6YGbZ7qKjDH7iN74Ypv486+CXI3LaHv036mrdmr8eB07o0Z/jJx3JSp6ZKLCJlzHR7SERaWpqnp6fHOwyJkYysHYyYvILx876hmsE1A1O57sRU2jfVw75EjoaZzXL3tOLWVaRhyiJlyt2ZsSqHZz9fycdLs6hZvRrDBqUy/JQOukFSpBwowUilU1jofLh4E09/uoJ567bRpG4yd5zeietP6kCTusnxDk+kylCCkUpjz74CXpu5lhe/Ws3q7N20b1KH3w/tySUntKN2clK8wxOpcpRgJOFl78zj1elrGTV1DVt25nHCMY352dldObdXK6rrQV8icaMEIwlrbfZuRn65ijEz1pKXX8j3uzTnttM6MlB33YtUCEowknBmrcnh2c9W8eHijVQz44I+bbjttI50alE/3qGJSBQlGEkI+QWFTFy0iee+WMmctdtoUKs6Pz61I8MGpdKqoWY2FqmIlGCkQtu6ax+vpa/j5WlrWL91D8c0rcNDQ3tycd+21K2pX1+Rikz/QqXCcXdmrdnK2JnreGfeN+TlF9IvtTG/Oa8HZ/VoSVI13XEvkgiUYKTC2L0vn7dnZzJ66mq+3laMDiAAAA0GSURBVLSTuslJXHxCW647MZUuLdW/IpJolGAk7lZv2cWYGWsZM2Mt2/fm0yulAX+++DjOO661msFEEpj+9Upc7Msv5INFGxkzfS1TV2aTVM0Y3LMVN5ycSt/2jTXxpEgloAQj5WpN9i7GzFjHm7PWsWXnPto1qc3Pz+rCZf3a0bKBRoOJVCZKMBJz+wsKmbR4E2NmrOXz5VtIqmac0a0FV/Vvz6ldmlNNnfYilZISjMTM2uzdjJ25ltfT17NlZx5tGtbiZ2d14bK0drp3RaQKUIKRMrVnXwGTl2V9e7VSzeCMbi25akA7Tu3SQkOMRaoQJRg5aoWFzozVOYybncm/FmxgZ14+bRrW4qc/6MJl/drSuqGevSJSFSnByBFbsXkn42ZnMm5OJpnb9lA3OYnBvVrzo+NTGNSxqa5WRKo4JRg5LFnb9/LO/A1MmJvJvPW5VDM4uXNzfjW4K2f1aEmdZP1KiUiE/hrIIeXu3s/7CzcwYd43TF2ZjTv0SmnAfed2Z2ifNrTQ8GIRKYYSjBRr8448Ply8kQ8WbmTqimzyC51jm9XlztM7cUGfFDq1qBfvEEWkglOCkW99s20PHyyMJJWZa3Jwh2Oa1mH4KR04t1drjmvbUHfYi0ipKcFUcau27ApJZQPz1ucC0K1Vff77jM4M+V4rurasr6QiIkdECaaKcXeWbNjBR0s28d6CDSzduAOA3m0bcvfgbpzTsyXHNlfzl4gcPSWYKmDv/gK+zNjC5GVZfLwkiw25ewHon9qE+8/vwTm9WpHSSPeqiEjZUoKppFZv2cUXGVuYvDSLL1dsYe/+QuokJ3FSp2b89KwunNqluSaXFJGYUoKpJAoLnbnrtz
Fl2WY+XLTx26avlEa1uaJfe87o1oIBxzahZvWkOEcqIlWFEkwCy96Zx7SVOUxZlsWUrzezeUce1QxOOKYx95/fg9O7tSC1aR110otIXCjBJJCsHXuZvjKHaSuzmb4qh4ysnQA0qFWdU7o056zuLTmta3Ma1UmOc6QiIkowFdrG3L0hmWQzfWUOK7fsAqBuchJpqU24qG8KAzo0pXfbhlRPqhbnaEVE/p0STAVRUOis2rKTWWu2Mm1lDnPWbmV19m4A6tesTr8OTbi8XzsGHtuUnm0aKKGISIVXaROMmQ0G/g4kAc+5+8NxDulb+/ILWbF5Jwsyc1mUmcvCb7azdMN2du0rAKBZvWT6tm/M1QOOYVDHpnRv3UAzE4tIwqmUCcbMkoCngLOA9cBMM5vg7otjed7d+/LZsmMfW3blsWVHHtm79pG9M48tO/exZWceW3bmkbUjj7XZu8kvdCDS3NWzTUMuTWtHzzYNOL59Izo2r6eOeRFJeJUywQD9gQx3XwlgZmOBoUCZJ5jHP17OG7PWsWXHPvbsLyh2m/o1q9Osfk2a1k2mS4v6DO7Ziq6t6tMrpSEdmtbVM+lFpFKqrAkmBVgX9Xk9MODAjczsZuBmgPbt2x/RiVo2qMkJ7RvTtF5NmtWrSdN6yTQP783q1aRJ3WRq1dC9JyJS9VTWBFMq7v4M8AxAWlqaH8kxLu/Xnsv7HVlyEhGpzCrrUKRMoF3U57ahTEREykllTTAzgc5m1sHMkoErgAlxjklEpEqplE1k7p5vZncAE4kMUx7p7oviHJaISJVSKRMMgLu/B7wX7zhERKqqytpEJiIicaYEIyIiMaEEIyIiMaEEIyIiMWHuR3R/YaVjZpuBNUe4ezNgSxmGE0+qS8VTWeoBqktFdTR1Ocbdmxe3QgmmDJhZurunxTuOsqC6VDyVpR6gulRUsaqLmshERCQmlGBERCQmlGDKxjPxDqAMqS4VT2WpB6guFVVM6qI+GBERiQldwYiISEwowYiISEwowRwlMxtsZsvMLMPM7ol3PMUxs9VmtsDM5ppZeihrYmaTzGx5eG8cys3MHg/1mW9mfaOOMyxsv9zMhpVT7CPNLMvMFkaVlVnsZnZC+NlkhH1j9vzqg9TlQTPLDN/NXDM7N2rdvSGuZWZ2TlR5sb9z4fEU00P5a+FRFbGoRzszm2xmi81skZndFcoT7nspoS6J+L3UMrMZZjYv1OV3JZ3fzGqGzxlhfeqR1vGg3F2vI3wReRTACuBYIBmYB/SId1zFxLkaaHZA2Z+Be8LyPcCfwvK5wPuAAQOB6aG8CbAyvDcOy43LIfbvA32BhbGIHZgRtrWw75ByrsuDwC+K2bZH+H2qCXQIv2dJJf3OAa8DV4TlfwC3xqgerYG+Ybk+8HWIN+G+lxLqkojfiwH1wnINYHr4GRZ7fuA24B9h+QrgtSOt48FeuoI5Ov2BDHdf6e77gLHA0DjHVFpDgVFheRRwYVT5aI+YBjQys9bAOcAkd89x963AJGBwrIN098+AnFjEHtY1cPdpHvmXNTrqWOVVl4MZCox19zx3XwVkEPl9K/Z3LvwP/wzgzbB/9M+lTLn7BnefHZZ3AEuAFBLweymhLgdTkb8Xd/ed4WON8PISzh/9fb0JnBniPaw6lhSTEszRSQHWRX1eT8m/nPHiwIdmNsvMbg5lLd19Q1jeCLQMywerU0Wqa1nFnhKWDywvb3eEpqORRc1KHH5dmgLb3D3/gPKYCs0qxxP533JCfy8H1AUS8HsxsyQzmwtkEUnYK0o4/7cxh/W5Id4y+xugBFM1nOzufYEhwO1m9v3oleF/iQk5Xj2RYw+eBjoCfYANwF/jG07pmVk94C3gJ+6+PXpdon0vxdQlIb8Xdy9w9z5AWyJXHN3iGY8SzNHJBNpFfW4byioUd88M71nAOCK/eJtCUwThPStsfrA6VaS6llXsmWH5wPJy4+6bwh+FQuBZIt8NHH5dsok0PVU/oDwmzKwGkT/Ir7
j726E4Ib+X4uqSqN9LEXffBkwGBpVw/m9jDusbhnjL7G+AEszRmQl0DqM0kol0lE2Ic0z/xszqmln9omXgbGAhkTiLRu0MA8aH5QnAtWHkz0AgNzR7TATONrPGobng7FAWD2USe1i33cwGhrbna6OOVS6K/iAHPyLy3UCkLleEkT4dgM5EOr6L/Z0LVwyTgUvC/tE/l7KO2YDngSXu/mjUqoT7Xg5WlwT9XpqbWaOwXBs4i0if0sHOH/19XQJ8EuI9rDqWGFRZj2Soai8iI2S+JtLWeV+84ykmvmOJjPaYBywqipFIW+vHwHLgI6BJKDfgqVCfBUBa1LFuINLhlwFcX07xjyHSRLGfSJvv8LKMHUgj8sdjBfAkYXaLcqzLSyHW+eEfa+uo7e8LcS0jahTVwX7nwnc9I9TxDaBmjOpxMpHmr/nA3PA6NxG/lxLqkojfy3HAnBDzQuD+ks4P1AqfM8L6Y4+0jgd7aaoYERGJCTWRiYhITCjBiIhITCjBiIhITCjBiIhITCjBiIhITCjBiJQTM0szs8cPsm61mTU7jGNdamZLzGzyAeWpZnZVKfa/zsyeLO35RI6EEoxIOXH3dHf/7zI63HDgJnc//YDyVOCQCUakPCjBiAThf/9LzOxZizxP48NwRzRm1tHMPggThn5uZt3CxIKrwh3qjcysoGieNzP7zMw6H3D808zs3bDcNBx/kZk9R+RmxOJiutIiz0VZaGZ/CmX3E7lB8Hkze+SAXR4GTrHIM0x+apFnhLwQjjHHzA5MSJjZeWY21cyamdnZYXm2mb1hkTm6iq6wfhfKF5hZXOe4ksSgBCPy7zoDT7l7T2AbcHEofwa4091PAH4BjHD3AiJ3Ovcg8gd/NpE/7jWBdu6+vITzPAB8Ec4zDmh/4AZm1gb4E5Hp1vsA/czsQnd/CEgHrnb3Xx6w2z3A5+7ex90fA24nMvfk94ArgVFmVivqHD8K+xQ9UOs3wA88MjlqOvCzqGNvCeVPh5+BSImqH3oTkSpllbvPDcuzgNTwv/gTgTfsuwcr1gzvnxN5kFgH4H+Bm4BPiczbVJLvAxcBuPu/zGxrMdv0A6a4+2YAM3sl7Pd/h1Gfk4EnwnmWmtkaoEtYdwaRKVnOdvftZnY+kWT5ZahnMjA16lhFk1rOKopdpCRKMCL/Li9quQCoTeRKf5tHpkE/0GfArUAb4H7gl8BpRBJPRVf0dMIuRK5WjMgDwK48yPZFP5sC9LdDSkFNZCKH4JHng6wys0vh22fM9w6rZxC5uil0971EJkv8MZHEU5LPCJ3xZjaEyCODDzQDODX0jSQRaeL69BDH3UHk0b9FPgeuDufpQqQpbllYt4ZIE+BoM+sJTANOMrNOYfu6YR+RI6IEI1I6VwPDzaxoVuqhAO6eR+Qpf9PCdp8T+QO/4BDH+x3wfTNbRKS5ae2BG3hk2vp7iEy3Pg+Y5e6Hmup9PlBgZvPM7KfACKCamS0AXgOuCzEXnWNpqNsbQAPgOmCMmc0n0jymznw5YppNWUREYkJXMCIiEhNKMCIiEhNKMCIiEhNKMCIiEhNKMCIiEhNKMCIiEhNKMCIiEhP/D3XNhJEm46cWAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IaaCyAPlomLt"
},
"source": [
"### Update the embeddings"
]
},
{
"cell_type": "code",
"metadata": {
"id": "k-BNn3R6R0lY"
},
"source": [
"import torch"
],
"execution_count": 43,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "P5033SckRzzo"
},
"source": [
"new_size = len(kept_ids)\n",
"new_emb = torch.nn.Embedding(new_size, model.shared.embedding_dim)\n",
"new_head = torch.nn.Linear(in_features=model.lm_head.in_features, out_features=new_size, bias=False)"
],
"execution_count": 44,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CjD6LS_9fe_M"
},
"source": [
"for new_id, old_id in enumerate(kept_ids):\n",
" new_emb.weight.data[new_id] = model.shared.weight.data[old_id]\n",
" new_head.weight.data[new_id] = model.lm_head.weight.data[old_id]"
],
"execution_count": 45,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vv7IuBORRseE"
},
"source": [
"model.shared.weight = new_emb.weight\n",
"model.lm_head.weight = new_head.weight"
],
"execution_count": 46,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "QcIDtmymo56s"
},
"source": [
"The new model has 244M parameters - 42% of the original size. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "g_aPBQ20kvCB",
"outputId": "d2289964-728d-45b2-afc9-e366e1d6b98b"
},
"source": [
"print(msize(model), msize(model) / original_size)"
],
"execution_count": 48,
"outputs": [
{
"output_type": "stream",
"text": [
"244309248 0.4194861110195362\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vdKmFJY_k7xZ"
},
"source": [
"### Update the tokenizer"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-X25sG0jmc83"
},
"source": [
"T5 uses the SentencePiece tokenizer, which is implemented in C++ and is opaque to Python. \n",
"\n",
"Fortunately, we can download the Protobuf schema of its model and use it to load the model into Python, where we can modify it. \n",
"\n",
"https://github.com/google/sentencepiece/issues/121"
]
},
{
"cell_type": "code",
"metadata": {
"id": "OpII_eX3mY80"
},
"source": [
"!wget https://raw.githubusercontent.com/google/sentencepiece/master/src/sentencepiece_model.proto"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "SGb1DiYmpnkr"
},
"source": [
"We compile the protobuf description of the sentencepiece model in order to be able to modify it. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "I6B0MA5DmaZM"
},
"source": [
"! protoc --python_out=. sentencepiece_model.proto"
],
"execution_count": 51,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "nJwHRRzbngJY"
},
"source": [
"Now we can serialize the model used by the current tokenizer and open it as a protobuf class. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 117,
"referenced_widgets": [
"b432b120350b42388447dcfcf959d673",
"5660d17e6c0e40b7bee6dc3d1af46f34",
"9dcf8c4bef5342aba837077c4904d852",
"d9a80cc59ada42fe908726abd05942bf",
"847d7a0c9f4a418e93484cd9bf8b8e0a",
"6c74fe4ffc4f4013bf73d00212e5775a",
"5a17271685624ad3bd2aa9c88504f969",
"1780d72d89b5449eaa0dcea9c595b6d7"
]
},
"id": "MdQM0L3lnybA",
"outputId": "aa4db10b-f8c2-48fe-8e67-2afa6586550c"
},
"source": [
"import sentencepiece_model_pb2 as spmp\n",
"smp = tokenizer.sp_model.serialized_model_proto()\n",
"m = spmp.ModelProto()\n",
"m.ParseFromString(smp)\n",
"\n",
"print('the loaded model has pieces:', len(m.pieces))\n",
"new_pieces = [m.pieces[idx] for idx in kept_ids]\n",
"print('the new pieces:', len(new_pieces))\n",
"\n",
"# replace the content of the first 30K pieces\n",
"for i, p in enumerate(new_pieces):\n",
" m.pieces[i].piece = p.piece\n",
" m.pieces[i].score = p.score\n",
" m.pieces[i].type = p.type\n",
"\n",
"# drop the remaining pieces\n",
"n = len(new_pieces)\n",
"for i in trange(len(m.pieces) - n):\n",
" m.pieces.pop(len(m.pieces) - 1)\n",
"\n",
"print(len(m.pieces))\n",
"with open('new_sp.model', 'wb') as f:\n",
" f.write(m.SerializeToString())"
],
"execution_count": 56,
"outputs": [
{
"output_type": "stream",
"text": [
"the loaded model has pieces: 250100\n",
"the new pieces: 30000\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b432b120350b42388447dcfcf959d673",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=220100.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n",
"30000\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "qWeP6N1sry93"
},
"source": [
"new_tokenizer = T5Tokenizer('new_sp.model', extra_ids=0)"
],
"execution_count": 78,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "czfXG1IqsDT4"
},
"source": [
"### Save the model"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oanCNPiIsCdU",
"outputId": "574a65e7-a1f4-465b-aa28-f6b2d9990200"
},
"source": [
"model.config.__dict__['vocab_size'] = new_size\n",
"model.config.__dict__['_name_or_path'] = 'cointegrated/rut5-base'\n",
"model.config"
],
"execution_count": 79,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"T5Config {\n",
" \"_name_or_path\": \"cointegrated/rut5-base\",\n",
" \"architectures\": [\n",
" \"T5ForConditionalGeneration\"\n",
" ],\n",
" \"d_ff\": 2048,\n",
" \"d_kv\": 64,\n",
" \"d_model\": 768,\n",
" \"decoder_start_token_id\": 0,\n",
" \"dropout_rate\": 0.1,\n",
" \"eos_token_id\": 1,\n",
" \"feed_forward_proj\": \"gated-gelu\",\n",
" \"initializer_factor\": 1.0,\n",
" \"is_encoder_decoder\": true,\n",
" \"layer_norm_epsilon\": 1e-06,\n",
" \"model_type\": \"t5\",\n",
" \"num_decoder_layers\": 12,\n",
" \"num_heads\": 12,\n",
" \"num_layers\": 12,\n",
" \"output_past\": true,\n",
" \"pad_token_id\": 0,\n",
" \"relative_attention_num_buckets\": 32,\n",
" \"tie_word_embeddings\": false,\n",
" \"tokenizer_class\": \"T5Tokenizer\",\n",
" \"transformers_version\": \"4.5.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 30000\n",
"}"
]
},
"metadata": {
"tags": []
},
"execution_count": 79
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "UaebisNqr4Mk"
},
"source": [
"new_tokenizer.save_pretrained('rut5-base')\n",
"model.save_pretrained('rut5-base')"
],
"execution_count": 81,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nIoB98_9r7VU",
"outputId": "b7d1858d-d51f-4cc6-ecc2-86bc59f6a36d"
},
"source": [
"!ls rut5-base -alsh"
],
"execution_count": 82,
"outputs": [
{
"output_type": "stream",
"text": [
"total 933M\n",
"4.0K drwxr-xr-x 2 root root 4.0K Apr 30 21:26 .\n",
"4.0K drwxr-xr-x 1 root root 4.0K Apr 30 21:26 ..\n",
"4.0K -rw-r--r-- 1 root root 677 Apr 30 21:33 config.json\n",
"933M -rw-r--r-- 1 root root 933M Apr 30 21:33 pytorch_model.bin\n",
"4.0K -rw-r--r-- 1 root root 65 Apr 30 21:33 special_tokens_map.json\n",
"812K -rw-r--r-- 1 root root 809K Apr 30 21:33 spiece.model\n",
"4.0K -rw-r--r-- 1 root root 116 Apr 30 21:33 tokenizer_config.json\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5gFLD5dUs7gZ"
},
"source": [
"Now try to load the model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ewebox5usyq9"
},
"source": [
"model1 = T5ForConditionalGeneration.from_pretrained('rut5-base')\n",
"tokenizer1 = T5Tokenizer.from_pretrained('rut5-base')"
],
"execution_count": 83,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "_GVnO2C0ruQx"
},
"source": [
"The model has not been fine-tuned on any sensible task except filling the gaps. And even this task is performed strangely - the model continues generating when it should have stopped. \n",
"\n",
"We hope that it will get better after fine-tuning, but that is the topic of the next story."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "08zibfjgtNhF",
"outputId": "9e9f2025-54d0-4a21-ede0-a023c38383b7"
},
"source": [
"inputs = tokenizer1('The <extra_id_0> walks in <extra_id_1> park.', return_tensors='pt')\n",
"with torch.no_grad():\n",
" hypotheses = model1.generate(\n",
" **inputs, \n",
" do_sample=True, top_p=0.95, \n",
" num_return_sequences=3, \n",
" repetition_penalty=2.5,\n",
" max_length=32,\n",
" )\n",
"for h in hypotheses:\n",
" print(tokenizer1.decode(h))"
],
"execution_count": 88,
"outputs": [
{
"output_type": "stream",
"text": [
"<pad> <extra_id_0> evening on <extra_id_1> the <extra_id_2> the park</s> <pad> <pad> <pad>\n",
"<pad> <extra_id_0> Great <extra_id_1> the <extra_id_2> a <extra_id_3> nature center,</s> <pad>\n",
"<pad> <extra_id_0> forest <extra_id_1> this <extra_id_2> a <extra_id_3> summer in the...</s>\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tsR9lH3_uqF3",
"outputId": "a6f14551-247b-45e1-e3a9-fac939079322"
},
"source": [
"inputs = tokenizer1('Красивая <extra_id_0> гуляет <extra_id_1> парку.', return_tensors='pt')\n",
"with torch.no_grad():\n",
" hypotheses = model1.generate(\n",
" **inputs, \n",
" do_sample=True, top_p=0.95, \n",
" num_return_sequences=3, \n",
" repetition_penalty=2.5,\n",
" max_length=32,\n",
" )\n",
"for h in hypotheses:\n",
" print(tokenizer1.decode(h))"
],
"execution_count": 89,
"outputs": [
{
"output_type": "stream",
"text": [
"<pad> <extra_id_0> птица <extra_id_1> в <extra_id_2>, <extra_id_3>. Гул <extra_id_4>! Красивый <extra_id_5> молодец</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>\n",
"<pad> <extra_id_0> музыка <extra_id_1> в <extra_id_2> в <extra_id_3> осеннее платье в <extra_id_4> в <extra_id_5> и</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>\n",
"<pad> <extra_id_0> женщина, она <extra_id_1> по <extra_id_2> в <extra_id_3>. Красивый <extra_id_39>! Настроение - красиво во всем лесном</s>\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2nZt98FYwcex"
},
"source": [
"I will save the model on my Google Drive to retrieve it later for fine-tuning. "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iMG9dNShwg9U",
"outputId": "9a9de585-085e-47a2-8e03-068a95e6857f"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/gd')"
],
"execution_count": 91,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /gd\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_j56QoXBwjCS",
"outputId": "c3df1fb9-49b6-4fbd-94c6-b0ffc38e2fca"
},
"source": [
"model1.save_pretrained('/gd/MyDrive/models/rut5-base-raw')\n",
"tokenizer1.save_pretrained('/gd/MyDrive/models/rut5-base-raw')"
],
"execution_count": 92,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('/gd/MyDrive/models/rut5-base-raw/tokenizer_config.json',\n",
" '/gd/MyDrive/models/rut5-base-raw/special_tokens_map.json',\n",
" '/gd/MyDrive/models/rut5-base-raw/spiece.model',\n",
" '/gd/MyDrive/models/rut5-base-raw/added_tokens.json')"
]
},
"metadata": {
"tags": []
},
"execution_count": 92
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "57AlTaqpw2Ew"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
@wilfoderek
Copy link

Nice work

@wilfoderek
Copy link

I have a question. I suppose that if I want to fine-tune the model on my own text, I only need to extract sentences from my documents and it will work, right?

@avidale
Copy link
Author

avidale commented Aug 25, 2022

I suppose that if I want to fine-tune the model on my own text, I only need to extract sentences from my documents and it will work, right?

You can fine-tune the model created by my script exactly the same way as you would fine-tune any other seq2seq transformer.

@AimeM250
Copy link

Quick question, how could I feed the BIO format of the sentences as part of the input fed to the model?

@avidale
Copy link
Author

avidale commented Nov 12, 2022

@AimeM250 it depends.
Normally, BIO format is used for sequence tagging problems (such as named entity recognition). But for these problems, an encoder-decoder architecture such as T5 is redundant; encoder-only models (such as BERT) are usually enough.

What exactly are you trying to achieve?

@EDF99
Copy link

EDF99 commented Jul 14, 2023

Awesome work man. It helps a ton! thanks for your contribution to the community.

I have a question. Why do you get the model with T5ForConditionalGeneration() instead of MT5ForConditionalGeneration()?
Would there be any difference to get the model using MT5ForConditionalGeneration() ?

Because when I finished creating the mt5 with the Spanish language, the model.config states that the architecture is MT5ForConditionalGeneration, while yours states that it is T5ForConditionalGeneration. I don't know if it's something I should just ignore, or if I messed up somewhere.

@avidale
Copy link
Author

avidale commented Jul 16, 2023

Hi @EDF99,
I never encountered any problems by using MT5ForConditionalGeneration and T5ForConditionalGeneration interchangeably.
The comments in MT5 source code (https://github.com/huggingface/transformers/blob/v4.30.0/src/transformers/models/mt5/modeling_mt5.py) suggest that most of it was copy-pasted from T5.
This doesn't guarantee that they are 100% compatible, but I have an impression that they are.

@EDF99
Copy link

EDF99 commented Jul 19, 2023

Thanks man @avidale!

@ozzieandthestraw
Copy link

Hi, thanks a lot for this, it's super helpful and I've managed to get it to work well.

One question: do you have any idea how could I get this to work with XLM-R? They both use sentencepiece tokenizers, but I can't seem to get it to work.

All of my attempts so far have resulted in an "unknown error" on HF's inference API, and an "index out of range in self" error when I try to use them myself.

@avidale
Copy link
Author

avidale commented Oct 31, 2023

@ozzieandthestraw you are talking about dropping tokens in models like https://huggingface.co/xlm-roberta-base, right? I could try adapting my code for this.

@ozzieandthestraw
Copy link

@avidale yep, that's the one, thank you. I think the biggest issue is with updating the embeddings, the rest of your code seems to work fine.

@avidale
Copy link
Author

avidale commented Oct 31, 2023

@ozzieandthestraw Ok, here is my notebook for updating XLM-Roberta: https://colab.research.google.com/drive/1f-n3zBQjmtMrp7oHzvunHPSC5aIMNe_N?usp=sharing.

The principle is the same as the notebook above. A nice difference is that we no longer need to compile sentencepiece_model.proto manually; nowadays the required objects are already included in the sentencepiece distribution.

@ozzieandthestraw
Copy link

@avidale thank you so much! That works excellently, you are a life saver.

One thing, I changed the line tokenizer_new = XLMRobertaTokenizer.from_pretrained('tmp_tokenizer') to tokenizer_new = XLMRobertaTokenizer.from_pretrained('tmp_tokenizer/sentencepiece.bpe.model').

@rukaiya-rk-24
Copy link

@avidale thank you so much! That works excellently.

I am trying to fine-tune a model which works like ChatGPT for Punjabi language, using the mt5-base, however I am not sure if I should go ahead with it since it does not even generate text and when I try to use it, I just get a response as <extra_pad> 0. I have checked the tokenizers, they work fine with Punjabi language, can anyone please tell how may I go on about it?

The dataset I will be using is an instruction following dataset in the format of alpaca and is of high quality.

I have tried fine-tuning indic-gpt before, however it has a very small token size i.e.1024 so I changed my base model.

Thanks in advance!

@avidale
Copy link
Author

avidale commented Nov 11, 2023

Hi @rukaiya-rk-24 !
I have never fine-tuned ChatGPT-like models and I know very little about Punjabi, so I cannot tell you for sure. But what I know is:

  1. MT5 probably hasn’t seen Punjabi at all during pretraining (or maybe only a little bit of it that got mixed into other languages by accident). The list of languages that MT5 has seen is at https://huggingface.co/google/mt5-base.
  2. MT5 wasn’t pretrained to generate full sentences at all, only to predict missing tokens. Thus, to make it generate long coherent answers, like chatgpt, you need to train it for a very long time on a very big diverse dataset. I tried to do this with MT5 for Russian, which is one of its pretraining languages (so it should be easier), but still the results were far from perfect.

So instead of mT5, I would recommend using a model that (1) has been pretrained with Punjabi as one of its languages, and (2) has been pretrained with the autoregressive language modelling task (the one that GPT models also use), so that it can already generate fluent texts. One model that fulfills these criteria is BLOOM, so I suggest picking the largest of the BLOOM models from https://huggingface.co/bigscience?sort_models=likes#models that fits into your memory during fine-tuning (e.g. the 1B version).

@rukaiya-rk-24
Copy link

Thank you so much for the help!

@Sandeep0408
Copy link

Sandeep0408 commented Nov 16, 2023

Hi @avidale, I'm trying to run a sentiment classification on a Dutch dataset using the tokenizer as :
tokenizer = T5TokenizerFast.from_pretrained('yhavinga/t5-base-dutch')

and below arguments for training :
model_name_or_path="yhavinga/t5-base-dutch",
tokenizer_name_or_path="https://huggingface.co/yhavinga/t5-base-dutch/blob/main/tokenizer.json"

When trying to train the model, I get an error:
170 def LoadFromFile(self, arg):
--> 171 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)

RuntimeError: Internal: /tmp/pip-install-ilecn6h0/sentencepiece_3c5f89f9146b4090a668d0f42db59389/bundled/sentencepiece/src/sentencepiece_processor.cc(823) [model_proto->ParseFromArray(serialized.data(), serialized.size())].

I've made the following changes, but still no luck:

  1. Changed the versions of transformers (currently it's v4.35.2), tokenizers (currently it's 0.15.0) and sentencepiece.
  2. Changed T5TokenizerFast to AutoTokenizer, but the issue still persists.
  3. Tried running it on an English dataset, which works fine, but whenever I change the tokenizer, model_name_or_path and tokenizer_name_or_path, I face the above-mentioned issue.

Could you help? Thanks in Advance

@avidale
Copy link
Author

avidale commented Nov 16, 2023

When you say "arguments for training", where exactly do you use them? Are you using a huggingface trainer or something else?

If you give me a minimal example of code that can reproduce your problem, it would be easier for me to help.

My first guess is that you should replace

tokenizer_name_or_path="https://huggingface.co/yhavinga/t5-base-dutch/blob/main/tokenizer.json"

with simply

tokenizer_name_or_path="yhavinga/t5-base-dutch"

but without more context, I cannot be sure.

@Sandeep0408
Copy link

Thanks for your response @avidale. Yes, I'm using an HF trainer, and the arguments are for that. I did make the change you suggested, but I am now getting a different error, shown below:

-> 310 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
311
312 def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):

TypeError: not a string

I've been following this notebook ("https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb#scrollTo=hcKmeIGiI582"). Appreciate the help!

@avidale
Copy link
Author

avidale commented Nov 16, 2023

Now it looks like a problem with incorrect input.
But again, without knowing the exact code that led to the error, I cannot say for sure.

@Sandeep0408
Copy link

Would this gist link help: https://gist.github.com/Sandeep0408/236b164cb09408c920aedb15d5c7e984

If not, I can give you the access for the colab notebook via mail. Thanks!

@WEXIJUE
Copy link

WEXIJUE commented Nov 20, 2023

Hello, I would like to know what version of Python you are using. I saved the model as model.safetensors instead of pytorch_model.bin. Do you have any solution, please? Thank you very much.

@Nehc
Copy link

Nehc commented Mar 2, 2024

Should this work with XLMRobertaModel, like e5-large? Or is something fundamentally different being used there. It didn't work out for me.

@avidale
Copy link
Author

avidale commented Mar 2, 2024

@Nehc

Should this work with XLMRobertaModel, like e5-large? Or is something fundamentally different being used there. It didn't work out for me.

As far as I can judge from the HF documentation, XLMRobertaTokenizer is based on SentencePiece, just like T5Tokenizer. Thus, in principle, the approach should work; I don't see any fundamental reasons why it wouldn't.

Nevertheless, the specific details, such as model parameter names, tokenizer parameter names, special tokens etc. may differ between T5 and XLMRoberta, so my code will surely need some adaptation to work with E5.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment