Skip to content

Instantly share code, notes, and snippets.

@xiaoouwang
Created February 11, 2021 11:04
Show Gist options
  • Save xiaoouwang/5168fd2e16090db97fe232577eeb1dc7 to your computer and use it in GitHub Desktop.
Save xiaoouwang/5168fd2e16090db97fe232577eeb1dc7 to your computer and use it in GitHub Desktop.
A minimal example of word vectors in gensim
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-02-11T10:46:28.455821Z",
"start_time": "2021-02-11T10:46:28.452645Z"
},
"trusted": true
},
"id": "important-tractor",
"cell_type": "code",
"source": "from nltk.tokenize import sent_tokenize, word_tokenize \nimport gensim \nfrom gensim.models import Word2Vec\nimport warnings \nwarnings.filterwarnings(action = 'ignore') \nimport urllib.request",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-02-11T10:54:35.389258Z",
"start_time": "2021-02-11T10:54:34.190907Z"
},
"scrolled": false,
"trusted": true
},
"id": "latin-vietnamese",
"cell_type": "code",
"source": "url = \"http://www.gutenberg.org/files/11/11-0.txt\"\ntext = urllib.request.urlopen(url).read().decode()\ntext = text.replace(\"\\n\", \" \")",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true,
"ExecuteTime": {
"start_time": "2021-02-11T11:00:02.342215Z",
"end_time": "2021-02-11T11:00:02.577196Z"
}
},
"id": "earned-stupid",
"cell_type": "code",
"source": "text_seq = []\nfor sent in sent_tokenize(text):\n temp = [w.lower() for w in word_tokenize(sent)]\n text_seq.append(temp)",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true,
"ExecuteTime": {
"start_time": "2021-02-11T11:00:54.331264Z",
"end_time": "2021-02-11T11:00:54.987397Z"
}
},
"id": "studied-cable",
"cell_type": "code",
"source": "# Create CBOW model \nmodel1 = gensim.models.Word2Vec(text_seq, min_count = 1, \n size = 100, window = 5) \n \n# Print results \nprint(\"Cosine similarity between 'alice' \" + \n \"and 'wonderland' - CBOW : \", \n model1.wv.similarity('alice', 'wonderland')) \n \nprint(\"Cosine similarity between 'alice' \" +\n \"and 'machines' - CBOW : \", \n model1.wv.similarity('alice', 'machines'))",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2021-02-11T11:01:30.415552Z",
"end_time": "2021-02-11T11:01:31.267242Z"
},
"trusted": true
},
"id": "welcome-captain",
"cell_type": "code",
"source": "# Create Skip Gram model \nmodel2 = gensim.models.Word2Vec(text_seq, min_count = 1, size = 100, \n window = 5, sg = 1) \n \n# Print results \nprint(\"Cosine similarity between 'alice' \" +\n \"and 'wonderland' - Skip Gram : \", \n model2.wv.similarity('alice', 'wonderland')) \n \nprint(\"Cosine similarity between 'alice' \" +\n \"and 'machines' - Skip Gram : \", \n model2.wv.similarity('alice', 'machines')) ",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.7.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nbTranslate": {
"hotkey": "alt-t",
"sourceLang": "en",
"targetLang": "fr",
"displayLangs": [],
"langInMainMenu": true,
"useGoogleTranslate": true
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": false,
"skip_h1_title": true,
"base_numbering": 1,
"title_cell": "Table des matières",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"window_display": false,
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"library": "var_list.py",
"delete_cmd_prefix": "del ",
"delete_cmd_postfix": "",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"library": "var_list.r",
"delete_cmd_prefix": "rm(",
"delete_cmd_postfix": ") ",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
]
},
"gist": {
"id": "",
"data": {
"description": "a minimal example of word vector in gensim",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment