Skip to content

Instantly share code, notes, and snippets.

@avidale
Created July 23, 2021 21:01
Show Gist options
  • Save avidale/05797b4c3d437f830d261d3c36fe9801 to your computer and use it in GitHub Desktop.
Save avidale/05797b4c3d437f830d261d3c36fe9801 to your computer and use it in GitHub Desktop.
phrase_similarity
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "phrase_similarity",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyMKHOz7OWF5NjENryTnAfBJ",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"496dbe2c95164c07ad997f2461355374": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_793ad4e7f01a4513ba6817d6e1719520",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_725b78b22b2749c1833416d5390244ed",
"IPY_MODEL_add2853762f240ab988fe4d2a0862471"
]
}
},
"793ad4e7f01a4513ba6817d6e1719520": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"725b78b22b2749c1833416d5390244ed": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_2cdbd719a479499592bb4b41052c0aaa",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 341,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 341,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_18f3de4fc41849d8971c65fac38ee227"
}
},
"add2853762f240ab988fe4d2a0862471": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_f64f635163614358a8a474cdae0963db",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 341/341 [00:01<00:00, 176B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_55dc263fa2dc44e6a90362d532bcbe51"
}
},
"2cdbd719a479499592bb4b41052c0aaa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"18f3de4fc41849d8971c65fac38ee227": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f64f635163614358a8a474cdae0963db": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"55dc263fa2dc44e6a90362d532bcbe51": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9d91d7baefdd41b6b2f05e57835ca945": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_950bd4907dae4f8c9c3095fa4e8deb08",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_c2aaa5de969a4bca909268af2418a75e",
"IPY_MODEL_94bfb0d1f2104916948d5087cf1350c3"
]
}
},
"950bd4907dae4f8c9c3095fa4e8deb08": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c2aaa5de969a4bca909268af2418a75e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_e10a5719c33b4ce586c4043177dbd40a",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 241082,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 241082,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9314409a2cb640599ef8a6a0b0644e50"
}
},
"94bfb0d1f2104916948d5087cf1350c3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_da0e5135bb274b3b847e76db0d53d879",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 241k/241k [00:00<00:00, 331kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b7f5544b4f634cf097a4440713d4f59d"
}
},
"e10a5719c33b4ce586c4043177dbd40a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"9314409a2cb640599ef8a6a0b0644e50": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"da0e5135bb274b3b847e76db0d53d879": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b7f5544b4f634cf097a4440713d4f59d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"953dc956581c414785cf6f013c04d3cd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_ffc6f49f6e6442a1893eb76ad3bf5763",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_282abfc811cf4c45ab5d3656d11f145d",
"IPY_MODEL_a9dedb118e3042a3b2135e814eca745f"
]
}
},
"ffc6f49f6e6442a1893eb76ad3bf5763": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"282abfc811cf4c45ab5d3656d11f145d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_8654dafe535d4b21bee229f32ba3dc0f",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 468145,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 468145,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d220aaff9c58407a9c43f959fc2f0073"
}
},
"a9dedb118e3042a3b2135e814eca745f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c2658d7fc0c74605abe05b4b78049b8c",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 468k/468k [00:00<00:00, 2.41MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c3b064231a8942cdb3bd43c52051b173"
}
},
"8654dafe535d4b21bee229f32ba3dc0f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"d220aaff9c58407a9c43f959fc2f0073": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c2658d7fc0c74605abe05b4b78049b8c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c3b064231a8942cdb3bd43c52051b173": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6cee4d068c99493d9d699ccfff14ffd3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_f515c8ad3a1d45f0a383789498b0a86c",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_882c48e326564a138431742eb4ffbed9",
"IPY_MODEL_1ad679348783482e829dbfe42a46f6a2"
]
}
},
"f515c8ad3a1d45f0a383789498b0a86c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"882c48e326564a138431742eb4ffbed9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_a35598b98ef94b4d9114f8d967738e11",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 112,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 112,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_469c152766c14b0a8a0b3820d407d711"
}
},
"1ad679348783482e829dbfe42a46f6a2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_6550a0c25d7a420aaf08aa1b1fd7fcad",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 112/112 [00:27<00:00, 4.11B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_6e04c77b213b4401b5044c60a806ff79"
}
},
"a35598b98ef94b4d9114f8d967738e11": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"469c152766c14b0a8a0b3820d407d711": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"6550a0c25d7a420aaf08aa1b1fd7fcad": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"6e04c77b213b4401b5044c60a806ff79": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/avidale/05797b4c3d437f830d261d3c36fe9801/phrase_similarity.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "B8LRh0gmHR3N"
},
"source": [
"Этот блокнот показывает, как выполнить классификацию коротких текстов по небольшому числу примеров, используя предобученную нейросеть. \n",
"\n",
"Работает это так:\n",
"1. Нейросеть переводит каждый текст в вектор. Она [обучалась](https://habr.com/ru/post/562064/) это делать так, что у текстов, похожих по смыслу, и векторы похожие. \n",
"2. Мы сравниваем вектор нового текста с векторами текстов-примеров и для каждого интента выводим максимальное сходство (косинусную близость) между вектором нового текста и векторами примеров этого интента. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "s53dL0rnFuWv"
},
"source": [
"!pip install sentencepiece transformers"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ASBhOfcoGBbq"
},
"source": [
"Выбираем модель, которую хотим использовать. \n",
"\n",
"https://huggingface.co/cointegrated/rubert-tiny - туповатая, но очень быстрая\n",
"\n",
"https://huggingface.co/cointegrated/LaBSE-en-ru - умнее, но больше и медленнее"
]
},
{
"cell_type": "code",
"metadata": {
"id": "iRbYqdesFyQL"
},
"source": [
"MODEL_NAME = 'cointegrated/rubert-tiny'"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tKv-X5cBHPc6"
},
"source": [
"Скачиваем модель"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 284,
"referenced_widgets": [
"496dbe2c95164c07ad997f2461355374",
"793ad4e7f01a4513ba6817d6e1719520",
"725b78b22b2749c1833416d5390244ed",
"add2853762f240ab988fe4d2a0862471",
"2cdbd719a479499592bb4b41052c0aaa",
"18f3de4fc41849d8971c65fac38ee227",
"f64f635163614358a8a474cdae0963db",
"55dc263fa2dc44e6a90362d532bcbe51",
"9d91d7baefdd41b6b2f05e57835ca945",
"950bd4907dae4f8c9c3095fa4e8deb08",
"c2aaa5de969a4bca909268af2418a75e",
"94bfb0d1f2104916948d5087cf1350c3",
"e10a5719c33b4ce586c4043177dbd40a",
"9314409a2cb640599ef8a6a0b0644e50",
"da0e5135bb274b3b847e76db0d53d879",
"b7f5544b4f634cf097a4440713d4f59d",
"953dc956581c414785cf6f013c04d3cd",
"ffc6f49f6e6442a1893eb76ad3bf5763",
"282abfc811cf4c45ab5d3656d11f145d",
"a9dedb118e3042a3b2135e814eca745f",
"8654dafe535d4b21bee229f32ba3dc0f",
"d220aaff9c58407a9c43f959fc2f0073",
"c2658d7fc0c74605abe05b4b78049b8c",
"c3b064231a8942cdb3bd43c52051b173",
"6cee4d068c99493d9d699ccfff14ffd3",
"f515c8ad3a1d45f0a383789498b0a86c",
"882c48e326564a138431742eb4ffbed9",
"1ad679348783482e829dbfe42a46f6a2",
"a35598b98ef94b4d9114f8d967738e11",
"469c152766c14b0a8a0b3820d407d711",
"6550a0c25d7a420aaf08aa1b1fd7fcad",
"6e04c77b213b4401b5044c60a806ff79"
]
},
"id": "xLiyRSAHGf0n",
"outputId": "daf4b292-34a0-4fc5-9a9a-bc254ced735e"
},
"source": [
"from transformers import AutoModel, AutoTokenizer\n",
"model = AutoModel.from_pretrained(MODEL_NAME)\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "496dbe2c95164c07ad997f2461355374",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=341.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9d91d7baefdd41b6b2f05e57835ca945",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241082.0, style=ProgressStyle(descripti…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "953dc956581c414785cf6f013c04d3cd",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=468145.0, style=ProgressStyle(descripti…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6cee4d068c99493d9d699ccfff14ffd3",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-IFz2p2rJPxo"
},
"source": [
"Пишем функцию для перевода текста в числовой вектор. Главная магия - тут!"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yMAQvzPAGAZt"
},
"source": [
"import numpy as np\n",
"import torch\n",
"\n",
"def embed_bert_cls(text, model, tokenizer):\n",
"    t = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')\n",
"    t = {k: v.to(model.device) for k, v in t.items()}\n",
"    with torch.no_grad():\n",
"        model_output = model(**t)\n",
"    embeddings = model_output.last_hidden_state[:, 0, :]\n",
"    embeddings = torch.nn.functional.normalize(embeddings)\n",
"    return embeddings[0].cpu().numpy()"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "GcFQRLlNJTs5"
},
"source": [
"Записываем наш словарик текстов из разных интентов, которые мы хотим уметь определять"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yfVuclAJGRem"
},
"source": [
"intents = {\n",
" 'how_are_you': ['как дела', 'как поживаешь'],\n",
" 'toast': ['я поднимаю стакан', 'пью до дна', 'за ваше здоровье'],\n",
" 'music': ['включи музыку', 'вруби шансон', 'исполните мне пожалуйста симфонию'],\n",
"}"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "S19MNrDGJZZy"
},
"source": [
"Переводим все эти тексты в векторы, чтобы с ними сравниваться"
]
},
{
"cell_type": "code",
"metadata": {
"id": "rHX23QcCHEpH"
},
"source": [
"example_vectors = []\n",
"intent_names = []\n",
"for intent, texts in intents.items():\n",
"    for text in texts:\n",
"        example_vectors.append(embed_bert_cls(text, model, tokenizer))\n",
"        intent_names.append(intent)\n",
"example_vectors = np.stack(example_vectors)"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XNFOkm6pJqjt",
"outputId": "045200bd-b3f4-48b9-f5cc-2755fbe96fc3"
},
"source": [
"example_vectors.shape"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(8, 312)"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jqyucHDxJel8"
},
"source": [
"Пишем функцию для сравнения текста с обучающими примерами"
]
},
{
"cell_type": "code",
"metadata": {
"id": "UnDHiRHxIxq9"
},
"source": [
"from collections import Counter\n",
"\n",
"def classify_text(text):\n",
"    vector = embed_bert_cls(text, model, tokenizer)\n",
"    scores = np.dot(example_vectors, vector)\n",
"    result = Counter()\n",
"    for score, intent in zip(scores, intent_names):\n",
"        result[intent] = max(result[intent], score)\n",
"    return result"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "S4GeMapYK1iu"
},
"source": [
"Теперь можно вписывать любой свой текст и смотреть на оценки сходства с каждым интентом"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ggw0D_ujJ5Uj",
"outputId": "1bfb8e63-6c82-40f1-8b2a-457174aa7a9f"
},
"source": [
"text = 'стакан я поднимаю'\n",
"print(classify_text(text).most_common())"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"[('toast', 0.9790392), ('music', 0.6900815), ('how_are_you', 0.65599346)]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6qSdfBl8KSO1",
"outputId": "c8b33e13-043d-42f5-acd0-c1d0aaa2b645"
},
"source": [
"text = 'подымаю чашу!'\n",
"print(classify_text(text).most_common())"
],
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"text": [
"[('toast', 0.78018653), ('how_are_you', 0.72003293), ('music', 0.6865895)]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w12XjvMbK7Z0",
"outputId": "1caf4fbd-935f-4fde-fdb1-c165b580c10a"
},
"source": [
"text = 'как пройти в библиотеку?'\n",
"print(classify_text(text).most_common())"
],
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"text": [
"[('how_are_you', 0.66733), ('toast', 0.52488065), ('music', 0.51675415)]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Bwu12Sb-KJeZ"
},
"source": [
"Можно замерить время выполнения функции: на классификацию одной фразы уходит примерно 10–15 мс"
]
},
{
"cell_type": "code",
"metadata": {
"id": "XgnSgnyDJ_S5",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8716fb26-9347-4d31-8fd0-acebcf6c396b"
},
"source": [
"%%time\n",
"text = 'стакан я поднимаю'\n",
"\n",
"print(classify_text(text).most_common())"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
"[('toast', 0.9790392), ('music', 0.6900815), ('how_are_you', 0.65599346)]\n",
"CPU times: user 7.92 ms, sys: 843 µs, total: 8.76 ms\n",
"Wall time: 11.2 ms\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zVcduq-PKIrC"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment