Skip to content

Instantly share code, notes, and snippets.

@cccntu
Created September 21, 2021 07:35
Show Gist options
  • Save cccntu/7387724e61d915e8d6fb46b9028fe648 to your computer and use it in GitHub Desktop.
Save cccntu/7387724e61d915e8d6fb46b9028fe648 to your computer and use it in GitHub Desktop.
parse-c4-date-from-url.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"colab": {
"name": "parse-c4-date-from-url.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"c8d31cebbeeb4ddbaa0e7883f5d1fd54": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_58d455318a1d4923863c0c5eb6074218",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_d947e6bff04943528e354ac20aa97d38",
"IPY_MODEL_53643ac0a96341be8dec1d5b69b63d8f",
"IPY_MODEL_edd30876b2ed4e448f4fa76e529e493c"
]
}
},
"58d455318a1d4923863c0c5eb6074218": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d947e6bff04943528e354ac20aa97d38": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0b392d0d3b73412ea44ff542c2da23a3",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e202bb21626141aba9217c195a0e036c"
}
},
"53643ac0a96341be8dec1d5b69b63d8f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_280a0b1a29564f748dd88bfad7bce35b",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4fced6bc1a0d4bd8bf27ef22172a2e27"
}
},
"edd30876b2ed4e448f4fa76e529e493c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_abe2de1f80474ade996bbeead3004fb4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1/1 [00:01<00:00, 1.10s/it]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3bd2e03b895044c9932f5d9c775231c4"
}
},
"0b392d0d3b73412ea44ff542c2da23a3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"e202bb21626141aba9217c195a0e036c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"280a0b1a29564f748dd88bfad7bce35b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"4fced6bc1a0d4bd8bf27ef22172a2e27": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"abe2de1f80474ade996bbeead3004fb4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3bd2e03b895044c9932f5d9c775231c4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b896fc2ef3c1449dab71e73755050a53": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_027ef8ff9cd441dd86159909fdd92a95",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_d03660e7c0364cc28652568b9031cf3a",
"IPY_MODEL_0b7431c9df444f5c92046b5b5b6bcb16",
"IPY_MODEL_3fcba8cc9da14d49a286cdb7e00185e8"
]
}
},
"027ef8ff9cd441dd86159909fdd92a95": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d03660e7c0364cc28652568b9031cf3a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_f11fee74c0a843f2ad0e32f2a9c1acca",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "convert url to date: 3%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_379dfd79d8264851a510be3541fb4365"
}
},
"0b7431c9df444f5c92046b5b5b6bcb16": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_6946565271664a5f9f07df351966b75e",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "",
"max": 13799838,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 367442,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_67d11f086ea54d509ee9fbbaffe04526"
}
},
"3fcba8cc9da14d49a286cdb7e00185e8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_d78418f706394df18d5b87371d6f8a5c",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 367102/13799838 [01:57<1:06:10, 3383.54ex/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c1102206267d489b9fae8df35d5c65d7"
}
},
"f11fee74c0a843f2ad0e32f2a9c1acca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"379dfd79d8264851a510be3541fb4365": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6946565271664a5f9f07df351966b75e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"67d11f086ea54d509ee9fbbaffe04526": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d78418f706394df18d5b87371d6f8a5c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c1102206267d489b9fae8df35d5c65d7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/cccntu/7387724e61d915e8d6fb46b9028fe648/parse-c4-date-from-url.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "DYPXhTLWQmxD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "fd68f138-3954-494b-e5d1-a984ac0659f3"
},
"source": [
"!pip install -q datasets"
],
"id": "DYPXhTLWQmxD",
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[K |████████████████████████████████| 270 kB 5.4 MB/s \n",
"\u001b[K |████████████████████████████████| 52 kB 1.4 MB/s \n",
"\u001b[K |████████████████████████████████| 119 kB 48.7 MB/s \n",
"\u001b[K |████████████████████████████████| 1.3 MB 39.7 MB/s \n",
"\u001b[K |████████████████████████████████| 243 kB 52.7 MB/s \n",
"\u001b[K |████████████████████████████████| 294 kB 57.8 MB/s \n",
"\u001b[K |████████████████████████████████| 142 kB 59.2 MB/s \n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jobi6-WJRprV",
"outputId": "c94778a1-58aa-4629-e37b-a44a584b8882"
},
"source": [
"!git clone https://github.com/cccntu/dateutil"
],
"id": "jobi6-WJRprV",
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'dateutil'...\n",
"remote: Enumerating objects: 6543, done.\u001b[K\n",
"remote: Counting objects: 100% (242/242), done.\u001b[K\n",
"remote: Compressing objects: 100% (157/157), done.\u001b[K\n",
"remote: Total 6543 (delta 113), reused 170 (delta 77), pack-reused 6301\u001b[K\n",
"Receiving objects: 100% (6543/6543), 5.85 MiB | 19.38 MiB/s, done.\n",
"Resolving deltas: 100% (4211/4211), done.\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nPL4coMgR1Dq",
"outputId": "d008cee4-075b-49f3-84b2-723f170e8ec6"
},
"source": [
"!cd dateutil && git checkout date-only-282 && git pull && cd .."
],
"id": "nPL4coMgR1Dq",
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Branch 'date-only-282' set up to track remote branch 'date-only-282' from 'origin'.\n",
"Switched to a new branch 'date-only-282'\n",
"Already up to date.\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s3AE3ajFharR",
"outputId": "2a633b91-ba9f-4b4c-939c-bde00738e3db"
},
"source": [
"!pip uninstall -y python-dateutil"
],
"id": "s3AE3ajFharR",
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Found existing installation: python-dateutil 2.8.2\n",
"Uninstalling python-dateutil-2.8.2:\n",
" Successfully uninstalled python-dateutil-2.8.2\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sMFxpNxfQ-6q",
"outputId": "2f03c183-a099-470a-f9fc-6dd7906bb078"
},
"source": [
"!pip install -e dateutil"
],
"id": "sMFxpNxfQ-6q",
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Obtaining file:///content/dateutil\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil==2.8.3.dev1+g697ca9a) (1.15.0)\n",
"Installing collected packages: python-dateutil\n",
" Attempting uninstall: python-dateutil\n",
" Found existing installation: python-dateutil 2.8.2\n",
" Uninstalling python-dateutil-2.8.2:\n",
" Successfully uninstalled python-dateutil-2.8.2\n",
" Running setup.py develop for python-dateutil\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n",
"Successfully installed python-dateutil-2.8.3.dev1+g697ca9a\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "DHsXeGUqTLmV"
},
"source": [
"# restart the runtime to use the new dateutil version "
],
"id": "DHsXeGUqTLmV",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5d8vUrryS8l-",
"outputId": "7df4bc21-222a-4901-fdcc-79aa5918a594"
},
"source": [
"# Sanity check: run this after restarting the runtime; it must complete without error.\n",
"# NOTE: `date_only` is presumably provided by the fork installed above\n",
"# (cccntu/dateutil, branch date-only-282), so a failure here suggests the fork is not the version in use.\n",
"from dateutil.parser import parse\n",
"parse('/2021/1/1/some text', fuzzy=True, date_only=True)"
],
"id": "5d8vUrryS8l-",
"execution_count": 1,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"datetime.datetime(2021, 1, 1, 0, 0)"
]
},
"metadata": {},
"execution_count": 1
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83,
"referenced_widgets": [
"c8d31cebbeeb4ddbaa0e7883f5d1fd54",
"58d455318a1d4923863c0c5eb6074218",
"d947e6bff04943528e354ac20aa97d38",
"53643ac0a96341be8dec1d5b69b63d8f",
"edd30876b2ed4e448f4fa76e529e493c",
"0b392d0d3b73412ea44ff542c2da23a3",
"e202bb21626141aba9217c195a0e036c",
"280a0b1a29564f748dd88bfad7bce35b",
"4fced6bc1a0d4bd8bf27ef22172a2e27",
"abe2de1f80474ade996bbeead3004fb4",
"3bd2e03b895044c9932f5d9c775231c4"
]
},
"id": "vz8mDExG5eR7",
"outputId": "87a1ba48-0457-440a-b9d2-160fda8e0a9e"
},
"source": [
"from datasets import load_dataset\n",
"\n",
"# this is 'c4', 'newslike' subset, I pre-processed it to keep only the url field, \n",
"# \n",
"dataset = load_dataset(\"bs-modeling-metadata/c4_newslike_url_only\", keep_in_memory=True)"
],
"id": "vz8mDExG5eR7",
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Using custom data configuration c4_newslike_url_only-4ac73b9230e356e4\n",
"Reusing dataset csv (/root/.cache/huggingface/datasets/csv/c4_newslike_url_only-4ac73b9230e356e4/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)\n"
]
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c8d31cebbeeb4ddbaa0e7883f5d1fd54",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "5eR9vTdj5jsl"
},
"source": [
"\n",
"from urllib.parse import urlsplit, unquote_plus\n",
"\n",
"\n",
"def get_path_from_url(url):\n",
"    \"\"\"Return the decoded path component of `url`.\n",
"\n",
"    The URL is split with `urlsplit` and only its path is kept; the path is\n",
"    then passed through `unquote_plus`, which decodes percent-escapes and\n",
"    turns '+' into a space.\n",
"    \"\"\"\n",
"    parts = urlsplit(url)\n",
"    return unquote_plus(parts.path)"
],
"id": "5eR9vTdj5jsl",
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "778a75315tb5",
"outputId": "85d5b875-4ffa-4634-c911-00776c8b3c55"
},
"source": [
"# I modified the dateutil source and installed it locally so that it parses only the date.\n",
"# Otherwise parsing can fail when the time cannot be parsed, even if the date can be.\n",
"# This way I can get more dates parsed.\n",
"from dateutil.parser import parser, parse, ParserError\n",
"\n",
"# _parser = parser()\n",
"# _parser.parse(datelike_paths[i], fuzzy=True)\n",
"def parse_date(path):\n",
"    \"\"\"Return the date parsed from `path`, or None if no date can be parsed.\n",
"\n",
"    Parsing is fuzzy (`fuzzy=True`) and uses the `date_only=True` option of\n",
"    the locally installed dateutil.\n",
"    \"\"\"\n",
"    try:\n",
"        return parse(path, fuzzy=True, date_only=True)\n",
"    except ParserError:\n",
"        return None\n",
"    except OverflowError:\n",
"        # dateutil documents OverflowError for parsed values that exceed the\n",
"        # largest valid C integer on the system; treat those paths as undated.\n",
"        return None\n",
"\n",
"\n",
"def remove_improbable_date(x):\n",
"    \"\"\"Return `x`, or None if its year is before 1983 or after 2021.\n",
"\n",
"    None is passed through unchanged.\n",
"    \"\"\"\n",
"    if x is not None and (x.year < 1983 or x.year > 2021):\n",
"        return None\n",
"    return x\n",
"\n",
"\n",
"parse_date(\"/2021/2/1/some name some number 123, 12\")"
],
"id": "778a75315tb5",
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"datetime.datetime(2021, 2, 1, 0, 0)"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "gkczdRJS59mk"
},
"source": [
"\n",
"def map_dataset_fn(example):\n",
"    \"\"\"Derive a 'date' column from the 'url' field of one dataset example.\n",
"\n",
"    Returns {'date': str(date)} when a plausible date is found in the URL\n",
"    path, and {'date': ''} otherwise.\n",
"    \"\"\"\n",
"    url = example['url']\n",
"    path = get_path_from_url(url)\n",
"    date = parse_date(path)\n",
"    date = remove_improbable_date(date)\n",
"    # An empty string stands in for a missing date.\n",
"    date = str(date) if date is not None else ''\n",
"    return {'date':date}"
],
"id": "gkczdRJS59mk",
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "GsLljBHIPy_w"
},
"source": [
"\n",
"ds = dataset['train']"
],
"id": "GsLljBHIPy_w",
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Mo12nLtlRaep"
},
"source": [
"# ~3,000 examples/second on colab"
],
"id": "Mo12nLtlRaep",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"b896fc2ef3c1449dab71e73755050a53",
"027ef8ff9cd441dd86159909fdd92a95",
"d03660e7c0364cc28652568b9031cf3a",
"0b7431c9df444f5c92046b5b5b6bcb16",
"3fcba8cc9da14d49a286cdb7e00185e8",
"f11fee74c0a843f2ad0e32f2a9c1acca",
"379dfd79d8264851a510be3541fb4365",
"6946565271664a5f9f07df351966b75e",
"67d11f086ea54d509ee9fbbaffe04526",
"d78418f706394df18d5b87371d6f8a5c",
"c1102206267d489b9fae8df35d5c65d7"
]
},
"id": "S-VA02_dJ24U",
"outputId": "ad5861ec-de05-43cf-c034-7449a86b5d23"
},
"source": [
"ds = ds.map(map_dataset_fn,desc='convert url to date', keep_in_memory=True)"
],
"id": "S-VA02_dJ24U",
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b896fc2ef3c1449dab71e73755050a53",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"convert url to date: 0%| | 0/13799838 [00:00<?, ?ex/s]"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "UcRRn3maRmNI"
},
"source": [
""
],
"id": "UcRRn3maRmNI",
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment