Skip to content

Instantly share code, notes, and snippets.

@ritog
Created July 17, 2021 10:42
Show Gist options
  • Save ritog/757c32d5b5065a91e63beb017e38dba9 to your computer and use it in GitHub Desktop.
Save ritog/757c32d5b5065a91e63beb017e38dba9 to your computer and use it in GitHub Desktop.
Copy of robi-poems.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Copy of robi-poems.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"15cfa01ee073450886a880886604c562": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_9caabaee8a0a499f9db335174e2295d5",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_62462f8646464a38aa5fceb53af0c72b",
"IPY_MODEL_724b8ea1b43647ba860325090ed99250"
]
}
},
"9caabaee8a0a499f9db335174e2295d5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"62462f8646464a38aa5fceb53af0c72b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_608a96da3b834b2dbee688614c041fbd",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 881,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 881,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_15031a8ce2c14dd8a1af5112abc4bc99"
}
},
"724b8ea1b43647ba860325090ed99250": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_4557a319d0024a8f8d3d5e10137e8120",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 881/881 [00:08<00:00, 107B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_25fe9839fa41409fa1b8be4f9caa1dc4"
}
},
"608a96da3b834b2dbee688614c041fbd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"15031a8ce2c14dd8a1af5112abc4bc99": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"4557a319d0024a8f8d3d5e10137e8120": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"25fe9839fa41409fa1b8be4f9caa1dc4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2c2664b316e843d98e97ed6ddd78ad1d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_87b54d2be7e44a14b71880aa7f0977fa",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_0e26bf1b7ff64064858b55f64c13e5c9",
"IPY_MODEL_10067a00aa7f46e895a11ea42c1a0c83"
]
}
},
"87b54d2be7e44a14b71880aa7f0977fa": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0e26bf1b7ff64064858b55f64c13e5c9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_5e51cc0230f9477ea229a9c8cf0a72c5",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1708956,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1708956,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b7b5ac4bfa484e05bfb65ecd0fdcabec"
}
},
"10067a00aa7f46e895a11ea42c1a0c83": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_90fe734421514632b7455707fdb6647a",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.71M/1.71M [00:06<00:00, 253kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fef095a5d6e047dcb79bff65a0b80124"
}
},
"5e51cc0230f9477ea229a9c8cf0a72c5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"b7b5ac4bfa484e05bfb65ecd0fdcabec": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"90fe734421514632b7455707fdb6647a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"fef095a5d6e047dcb79bff65a0b80124": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"580bb65ee3824889b5631568b99a58c8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_aac5891278e84c5291e5b0164044116e",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_66d20b10edfe4958a39368199fcaad3b",
"IPY_MODEL_306b0cfe44594bca8befbf05d6deff51"
]
}
},
"aac5891278e84c5291e5b0164044116e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"66d20b10edfe4958a39368199fcaad3b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_072fc33d415b4421a069677ad61ff011",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 510401385,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 510401385,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9e3c7fe1fff44b41af0798b7fe192f0f"
}
},
"306b0cfe44594bca8befbf05d6deff51": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_94f04430f25f49beb366ab3ca728ec3a",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 510M/510M [00:13<00:00, 36.7MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6f4cca4a80274eadaf20d28acb074d6a"
}
},
"072fc33d415b4421a069677ad61ff011": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"9e3c7fe1fff44b41af0798b7fe192f0f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"94f04430f25f49beb366ab3ca728ec3a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"6f4cca4a80274eadaf20d28acb074d6a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ghosh-r/757c32d5b5065a91e63beb017e38dba9/copy-of-robi-poems.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "dV9hhxEKF48s"
},
"source": [
"%%capture\n",
"! pip install datasets transformers"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MZ13avB4Ebnw"
},
"source": [
"# import re\n",
"import json\n",
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132,
"referenced_widgets": [
"15cfa01ee073450886a880886604c562",
"9caabaee8a0a499f9db335174e2295d5",
"62462f8646464a38aa5fceb53af0c72b",
"724b8ea1b43647ba860325090ed99250",
"608a96da3b834b2dbee688614c041fbd",
"15031a8ce2c14dd8a1af5112abc4bc99",
"4557a319d0024a8f8d3d5e10137e8120",
"25fe9839fa41409fa1b8be4f9caa1dc4",
"2c2664b316e843d98e97ed6ddd78ad1d",
"87b54d2be7e44a14b71880aa7f0977fa",
"0e26bf1b7ff64064858b55f64c13e5c9",
"10067a00aa7f46e895a11ea42c1a0c83",
"5e51cc0230f9477ea229a9c8cf0a72c5",
"b7b5ac4bfa484e05bfb65ecd0fdcabec",
"90fe734421514632b7455707fdb6647a",
"fef095a5d6e047dcb79bff65a0b80124"
]
},
"id": "yGHBLbrGIWyV",
"outputId": "eb5296d4-3223-4dbc-de2c-be5bb8c70128"
},
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"ghosh-r/bangla-gpt2\")\n",
"\n",
"train_path = 'train.txt'\n",
"test_path = 'test.txt'"
],
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "15cfa01ee073450886a880886604c562",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=881.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2c2664b316e843d98e97ed6ddd78ad1d",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1708956.0, style=ProgressStyle(descript…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1b4f-k8EIdXk",
"outputId": "f18dc59f-21cf-4268-8622-65f56430b036"
},
"source": [
"from transformers import TextDataset,DataCollatorForLanguageModeling\n",
"\n",
"def load_dataset(train_path,test_path,tokenizer):\n",
" train_dataset = TextDataset(\n",
" tokenizer=tokenizer,\n",
" file_path=train_path,\n",
" block_size=128)\n",
" \n",
" test_dataset = TextDataset(\n",
" tokenizer=tokenizer,\n",
" file_path=test_path,\n",
" block_size=128) \n",
" \n",
" data_collator = DataCollatorForLanguageModeling(\n",
" tokenizer=tokenizer, mlm=False,\n",
" )\n",
" return train_dataset,test_dataset,data_collator\n",
"\n",
"train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm.py\n",
" FutureWarning,\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 120,
"referenced_widgets": [
"580bb65ee3824889b5631568b99a58c8",
"aac5891278e84c5291e5b0164044116e",
"66d20b10edfe4958a39368199fcaad3b",
"306b0cfe44594bca8befbf05d6deff51",
"072fc33d415b4421a069677ad61ff011",
"9e3c7fe1fff44b41af0798b7fe192f0f",
"94f04430f25f49beb366ab3ca728ec3a",
"6f4cca4a80274eadaf20d28acb074d6a"
]
},
"id": "FuETPslLIo4U",
"outputId": "51042dd7-674a-4cbc-868c-e33dda336ea7"
},
"source": [
"from transformers import Trainer, TrainingArguments,AutoModelWithLMHead\n",
"\n",
"model = AutoModelWithLMHead.from_pretrained(\"ghosh-r/bangla-gpt2\")\n",
"\n",
"\n",
"training_args = TrainingArguments(\n",
" output_dir=\"./gpt2-robi_kobita\", # output directory for checkpoints and the saved model\n",
" overwrite_output_dir=True, # overwrite the contents of the output directory\n",
" num_train_epochs=3, # number of training epochs\n",
" per_device_train_batch_size=32, # batch size for training\n",
" per_device_eval_batch_size=64, # batch size for evaluation\n",
" eval_steps = 400, # number of update steps between two evaluations; only used when evaluation_strategy='steps'\n",
" save_steps=800, # save a checkpoint every 800 update steps\n",
" warmup_steps=500,# number of warmup steps for learning rate scheduler\n",
" prediction_loss_only=True,\n",
" )\n",
"\n",
"\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" data_collator=data_collator,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=test_dataset,\n",
")"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:847: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
" FutureWarning,\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "580bb65ee3824889b5631568b99a58c8",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=510401385.0, style=ProgressStyle(descri…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 347
},
"id": "j-t8_0pjZQiu",
"outputId": "9283797a-2d02-4b69-bc37-2e665deeaf12"
},
"source": [
"trainer.train()"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"***** Running training *****\n",
" Num examples = 6888\n",
" Num Epochs = 3\n",
" Instantaneous batch size per device = 32\n",
" Total train batch size (w. parallel, distributed & accumulation) = 32\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 648\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='648' max='648' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [648/648 06:08, Epoch 3/3]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>500</td>\n",
" <td>2.350000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TrainOutput(global_step=648, training_loss=2.242282631956501, metrics={'train_runtime': 369.4134, 'train_samples_per_second': 55.937, 'train_steps_per_second': 1.754, 'total_flos': 1974853779849216.0, 'train_loss': 2.242282631956501, 'epoch': 3.0})"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jBRFaebLd9NA",
"outputId": "adf8f2f6-1fb5-4966-cb5d-413f363d1884"
},
"source": [
"trainer.save_model()"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"Saving model checkpoint to ./gpt2-robi_kobita\n",
"Configuration saved in ./gpt2-robi_kobita/config.json\n",
"Model weights saved in ./gpt2-robi_kobita/pytorch_model.bin\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "PBW6W2D-d0HJ",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a817202f-34e6-4e4d-ec17-bde31dedae4d"
},
"source": [
"from transformers import pipeline\n",
"\n",
"poet = pipeline('text-generation',model='./gpt2-robi_kobita', tokenizer='ghosh-r/bangla-gpt2')"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"text": [
"loading configuration file ./gpt2-robi_kobita/config.json\n",
"Model config GPT2Config {\n",
" \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.0,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.0,\n",
" \"eos_token_id\": 50256,\n",
" \"gradient_checkpointing\": false,\n",
" \"initializer_range\": 0.02,\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"resid_pdrop\": 0.0,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.8.2\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"loading configuration file ./gpt2-robi_kobita/config.json\n",
"Model config GPT2Config {\n",
" \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.0,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.0,\n",
" \"eos_token_id\": 50256,\n",
" \"gradient_checkpointing\": false,\n",
" \"initializer_range\": 0.02,\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"resid_pdrop\": 0.0,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.8.2\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"loading weights file ./gpt2-robi_kobita/pytorch_model.bin\n",
"All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
"\n",
"All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./gpt2-robi_kobita.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n",
"Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
"loading configuration file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d916cda1fc7c33b497e404011afd7185469a7d6daf8d674e4bcbdb951e059768.0d65f2cde7eca11af939c04f8bd685073e094591ded3eb632f51dc0e29899bde\n",
"Model config GPT2Config {\n",
" \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.0,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.0,\n",
" \"eos_token_id\": 50256,\n",
" \"gradient_checkpointing\": false,\n",
" \"initializer_range\": 0.02,\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"resid_pdrop\": 0.0,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.8.2\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/vocab.json from cache at None\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/merges.txt from cache at None\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/8b3d4a6f9e80e0e5494bf8f1754de822dcdc06314d0994c9b68f7dbc1735cffa.9dfff3c35c0a768eb26244f921a71293e1adaa890f1468c982bad5568d6cd623\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/added_tokens.json from cache at None\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/special_tokens_map.json from cache at None\n",
"loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/tokenizer_config.json from cache at None\n",
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SPzHEeeUeHJZ",
"outputId": "aba9c81d-3f0e-4882-8ec8-ae95a2ce8c99"
},
"source": [
"poem = poet('আমি তোমাকে দেখেছি মোর হৃদয়ে', max_length=200)"
],
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"id": "oIOlTCTdxyKK",
"outputId": "c9742aca-b5d4-4b6f-bac1-869183ae4e5a"
},
"source": [
"poem[0]['generated_text']"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'আমি তোমাকে দেখেছি মোর হৃদয়ের কুকুরের মতো।\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0 মুহূর্তে সব নেভায়ে,\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0 চিরদিন থাকে কেন আমার ঘরে?\\n\\xa0\\xa0\\xa0জীবন আমার হৃদয়ের পালঙ্কের একেবারে কাছে,\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0আমার হৃদয়-সমুদ্র পারে।\\n\\xa0\\xa0\\xa0মনে হত আমার কত কত সংসারে\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0কত রাত্\\u200cরির কত বিহঙ্গ যেন হয়।\\n\\xa0\\xa0মনে হত পৃথিবীর কত মা যেন'"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GMvtF1Vg2oBk"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment