Created
September 21, 2021 07:35
-
-
Save cccntu/7387724e61d915e8d6fb46b9028fe648 to your computer and use it in GitHub Desktop.
parse-c4-date-from-url.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 5, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
}, | |
"colab": { | |
"name": "parse-c4-date-from-url.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"c8d31cebbeeb4ddbaa0e7883f5d1fd54": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_58d455318a1d4923863c0c5eb6074218", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_d947e6bff04943528e354ac20aa97d38", | |
"IPY_MODEL_53643ac0a96341be8dec1d5b69b63d8f", | |
"IPY_MODEL_edd30876b2ed4e448f4fa76e529e493c" | |
] | |
} | |
}, | |
"58d455318a1d4923863c0c5eb6074218": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"d947e6bff04943528e354ac20aa97d38": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_0b392d0d3b73412ea44ff542c2da23a3", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": "100%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_e202bb21626141aba9217c195a0e036c" | |
} | |
}, | |
"53643ac0a96341be8dec1d5b69b63d8f": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_280a0b1a29564f748dd88bfad7bce35b", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 1, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 1, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_4fced6bc1a0d4bd8bf27ef22172a2e27" | |
} | |
}, | |
"edd30876b2ed4e448f4fa76e529e493c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_abe2de1f80474ade996bbeead3004fb4", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 1/1 [00:01<00:00, 1.10s/it]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_3bd2e03b895044c9932f5d9c775231c4" | |
} | |
}, | |
"0b392d0d3b73412ea44ff542c2da23a3": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"e202bb21626141aba9217c195a0e036c": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"280a0b1a29564f748dd88bfad7bce35b": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"4fced6bc1a0d4bd8bf27ef22172a2e27": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"abe2de1f80474ade996bbeead3004fb4": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"3bd2e03b895044c9932f5d9c775231c4": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"b896fc2ef3c1449dab71e73755050a53": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_027ef8ff9cd441dd86159909fdd92a95", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_d03660e7c0364cc28652568b9031cf3a", | |
"IPY_MODEL_0b7431c9df444f5c92046b5b5b6bcb16", | |
"IPY_MODEL_3fcba8cc9da14d49a286cdb7e00185e8" | |
] | |
} | |
}, | |
"027ef8ff9cd441dd86159909fdd92a95": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"d03660e7c0364cc28652568b9031cf3a": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_f11fee74c0a843f2ad0e32f2a9c1acca", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": "convert url to date: 3%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_379dfd79d8264851a510be3541fb4365" | |
} | |
}, | |
"0b7431c9df444f5c92046b5b5b6bcb16": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_6946565271664a5f9f07df351966b75e", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "", | |
"max": 13799838, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 367442, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_67d11f086ea54d509ee9fbbaffe04526" | |
} | |
}, | |
"3fcba8cc9da14d49a286cdb7e00185e8": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_d78418f706394df18d5b87371d6f8a5c", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 367102/13799838 [01:57<1:06:10, 3383.54ex/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_c1102206267d489b9fae8df35d5c65d7" | |
} | |
}, | |
"f11fee74c0a843f2ad0e32f2a9c1acca": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"379dfd79d8264851a510be3541fb4365": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"6946565271664a5f9f07df351966b75e": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"67d11f086ea54d509ee9fbbaffe04526": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"d78418f706394df18d5b87371d6f8a5c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"c1102206267d489b9fae8df35d5c65d7": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/cccntu/7387724e61d915e8d6fb46b9028fe648/parse-c4-date-from-url.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DYPXhTLWQmxD", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "fd68f138-3954-494b-e5d1-a984ac0659f3" | |
}, | |
"source": [ | |
"!pip install -q datasets" | |
], | |
"id": "DYPXhTLWQmxD", | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[K |████████████████████████████████| 270 kB 5.4 MB/s \n", | |
"\u001b[K |████████████████████████████████| 52 kB 1.4 MB/s \n", | |
"\u001b[K |████████████████████████████████| 119 kB 48.7 MB/s \n", | |
"\u001b[K |████████████████████████████████| 1.3 MB 39.7 MB/s \n", | |
"\u001b[K |████████████████████████████████| 243 kB 52.7 MB/s \n", | |
"\u001b[K |████████████████████████████████| 294 kB 57.8 MB/s \n", | |
"\u001b[K |████████████████████████████████| 142 kB 59.2 MB/s \n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "jobi6-WJRprV", | |
"outputId": "c94778a1-58aa-4629-e37b-a44a584b8882" | |
}, | |
"source": [ | |
"!git clone https://github.com/cccntu/dateutil" | |
], | |
"id": "jobi6-WJRprV", | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cloning into 'dateutil'...\n", | |
"remote: Enumerating objects: 6543, done.\u001b[K\n", | |
"remote: Counting objects: 100% (242/242), done.\u001b[K\n", | |
"remote: Compressing objects: 100% (157/157), done.\u001b[K\n", | |
"remote: Total 6543 (delta 113), reused 170 (delta 77), pack-reused 6301\u001b[K\n", | |
"Receiving objects: 100% (6543/6543), 5.85 MiB | 19.38 MiB/s, done.\n", | |
"Resolving deltas: 100% (4211/4211), done.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "nPL4coMgR1Dq", | |
"outputId": "d008cee4-075b-49f3-84b2-723f170e8ec6" | |
}, | |
"source": [ | |
"!cd dateutil && git checkout date-only-282 && git pull && cd .." | |
], | |
"id": "nPL4coMgR1Dq", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Branch 'date-only-282' set up to track remote branch 'date-only-282' from 'origin'.\n", | |
"Switched to a new branch 'date-only-282'\n", | |
"Already up to date.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "s3AE3ajFharR", | |
"outputId": "2a633b91-ba9f-4b4c-939c-bde00738e3db" | |
}, | |
"source": [ | |
"!pip uninstall -y python-dateutil" | |
], | |
"id": "s3AE3ajFharR", | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Found existing installation: python-dateutil 2.8.2\n", | |
"Uninstalling python-dateutil-2.8.2:\n", | |
" Successfully uninstalled python-dateutil-2.8.2\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "sMFxpNxfQ-6q", | |
"outputId": "2f03c183-a099-470a-f9fc-6dd7906bb078" | |
}, | |
"source": [ | |
"!pip install -e dateutil" | |
], | |
"id": "sMFxpNxfQ-6q", | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Obtaining file:///content/dateutil\n", | |
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil==2.8.3.dev1+g697ca9a) (1.15.0)\n", | |
"Installing collected packages: python-dateutil\n", | |
" Attempting uninstall: python-dateutil\n", | |
" Found existing installation: python-dateutil 2.8.2\n", | |
" Uninstalling python-dateutil-2.8.2:\n", | |
" Successfully uninstalled python-dateutil-2.8.2\n", | |
" Running setup.py develop for python-dateutil\n", | |
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", | |
"albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", | |
"Successfully installed python-dateutil-2.8.3.dev1+g697ca9a\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DHsXeGUqTLmV" | |
}, | |
"source": [ | |
"# restart the runtime to use the new dateutil version " | |
], | |
"id": "DHsXeGUqTLmV", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "5d8vUrryS8l-", | |
"outputId": "7df4bc21-222a-4901-fdcc-79aa5918a594" | |
}, | |
"source": [ | |
"# make sure this cell runs without error\n", | |
"from dateutil.parser import parse\n", | |
"parse('/2021/1/1/some text', fuzzy=True, date_only=True)" | |
], | |
"id": "5d8vUrryS8l-", | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"datetime.datetime(2021, 1, 1, 0, 0)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 1 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 83, | |
"referenced_widgets": [ | |
"c8d31cebbeeb4ddbaa0e7883f5d1fd54", | |
"58d455318a1d4923863c0c5eb6074218", | |
"d947e6bff04943528e354ac20aa97d38", | |
"53643ac0a96341be8dec1d5b69b63d8f", | |
"edd30876b2ed4e448f4fa76e529e493c", | |
"0b392d0d3b73412ea44ff542c2da23a3", | |
"e202bb21626141aba9217c195a0e036c", | |
"280a0b1a29564f748dd88bfad7bce35b", | |
"4fced6bc1a0d4bd8bf27ef22172a2e27", | |
"abe2de1f80474ade996bbeead3004fb4", | |
"3bd2e03b895044c9932f5d9c775231c4" | |
] | |
}, | |
"id": "vz8mDExG5eR7", | |
"outputId": "87a1ba48-0457-440a-b9d2-160fda8e0a9e" | |
}, | |
"source": [ | |
"from datasets import load_dataset\n", | |
"\n", | |
"# This is the 'newslike' subset of 'c4', pre-processed to keep only the url field.\n", | |
"# \n", | |
"dataset = load_dataset(\"bs-modeling-metadata/c4_newslike_url_only\", keep_in_memory=True)" | |
], | |
"id": "vz8mDExG5eR7", | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"Using custom data configuration c4_newslike_url_only-4ac73b9230e356e4\n", | |
"Reusing dataset csv (/root/.cache/huggingface/datasets/csv/c4_newslike_url_only-4ac73b9230e356e4/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "c8d31cebbeeb4ddbaa0e7883f5d1fd54", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
" 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "5eR9vTdj5jsl" | |
}, | |
"source": [ | |
"\n", | |
"from urllib.parse import urlsplit, unquote_plus\n", | |
"def get_path_from_url(url):\n", | |
" parts = urlsplit(url)\n", | |
" return unquote_plus(parts.path)" | |
], | |
"id": "5eR9vTdj5jsl", | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "778a75315tb5", | |
"outputId": "85d5b875-4ffa-4634-c911-00776c8b3c55" | |
}, | |
"source": [ | |
"# I modified the dateutil source and installed it locally so that it only parses the date;\n", | |
"# otherwise parsing can fail when the time cannot be parsed, even if the date can be.\n", | |
"# This way more dates get parsed.\n", | |
"from dateutil.parser import parser, parse, ParserError\n", | |
"\n", | |
"# _parser = parser()\n", | |
"# _parser.parse(datelike_paths[i], fuzzy=True)\n", | |
"def parse_date(path):\n", | |
" try:\n", | |
" return parse(path, fuzzy=True, date_only=True)\n", | |
" except ParserError:\n", | |
" return None\n", | |
" except OverflowError:\n", | |
" # this happens, I don't know why, just ignore it\n", | |
" return None\n", | |
"\n", | |
"\n", | |
"def remove_improbable_date(x):\n", | |
" if x is not None and (x.year < 1983 or x.year > 2021):\n", | |
" return None\n", | |
" return x\n", | |
"\n", | |
"\n", | |
"parse_date(\"/2021/2/1/some name some number 123, 12\")" | |
], | |
"id": "778a75315tb5", | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"datetime.datetime(2021, 2, 1, 0, 0)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gkczdRJS59mk" | |
}, | |
"source": [ | |
"\n", | |
"def map_dataset_fn(example):\n", | |
" url = example['url']\n", | |
" path = get_path_from_url(url)\n", | |
" date = parse_date(path)\n", | |
" date = remove_improbable_date(date)\n", | |
" date = str(date) if date is not None else ''\n", | |
" return {'date':date}" | |
], | |
"id": "gkczdRJS59mk", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GsLljBHIPy_w" | |
}, | |
"source": [ | |
"\n", | |
"ds = dataset['train']" | |
], | |
"id": "GsLljBHIPy_w", | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Mo12nLtlRaep" | |
}, | |
"source": [ | |
"# ~3,000 examples/second on colab" | |
], | |
"id": "Mo12nLtlRaep", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49, | |
"referenced_widgets": [ | |
"b896fc2ef3c1449dab71e73755050a53", | |
"027ef8ff9cd441dd86159909fdd92a95", | |
"d03660e7c0364cc28652568b9031cf3a", | |
"0b7431c9df444f5c92046b5b5b6bcb16", | |
"3fcba8cc9da14d49a286cdb7e00185e8", | |
"f11fee74c0a843f2ad0e32f2a9c1acca", | |
"379dfd79d8264851a510be3541fb4365", | |
"6946565271664a5f9f07df351966b75e", | |
"67d11f086ea54d509ee9fbbaffe04526", | |
"d78418f706394df18d5b87371d6f8a5c", | |
"c1102206267d489b9fae8df35d5c65d7" | |
] | |
}, | |
"id": "S-VA02_dJ24U", | |
"outputId": "ad5861ec-de05-43cf-c034-7449a86b5d23" | |
}, | |
"source": [ | |
"ds = ds.map(map_dataset_fn,desc='convert url to date', keep_in_memory=True)" | |
], | |
"id": "S-VA02_dJ24U", | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b896fc2ef3c1449dab71e73755050a53", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
"convert url to date: 0%| | 0/13799838 [00:00<?, ?ex/s]" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UcRRn3maRmNI" | |
}, | |
"source": [ | |
"" | |
], | |
"id": "UcRRn3maRmNI", | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment