xiaoouwang/wordvector in gensim.ipynb

## wordvector in gensim.ipynb
{
  "cells": [
    {
      "metadata": {
        "ExecuteTime": {
          "end_time": "2021-02-11T10:46:28.455821Z",
          "start_time": "2021-02-11T10:46:28.452645Z"
        },
        "trusted": true
      },
      "id": "important-tractor",
      "cell_type": "code",
      "source": "from nltk.tokenize import sent_tokenize, word_tokenize \nimport gensim \nfrom gensim.models import Word2Vec\nimport warnings \nwarnings.filterwarnings(action = 'ignore') \nimport urllib.request",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "end_time": "2021-02-11T10:54:35.389258Z",
          "start_time": "2021-02-11T10:54:34.190907Z"
        },
        "scrolled": false,
        "trusted": true
      },
      "id": "latin-vietnamese",
      "cell_type": "code",
      "source": "# Download the corpus as plain text from Project Gutenberg\nurl = \"http://www.gutenberg.org/files/11/11-0.txt\"\n# The with-block closes the HTTP response as soon as the body has been read\nwith urllib.request.urlopen(url) as response:\n    raw_text = response.read().decode()\n# str.replace returns a new string, so the result must be bound to a name;\n# a bare call would leave the text unchanged\ntext = raw_text.replace(\"\\n\", \" \")\n# Show a short preview instead of echoing the whole book as cell output\ntext[:500]",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "ExecuteTime": {
          "start_time": "2021-02-11T11:00:02.342215Z",
          "end_time": "2021-02-11T11:00:02.577196Z"
        }
      },
      "id": "earned-stupid",
      "cell_type": "code",
      "source": "# One list of lower-cased word tokens per sentence of the corpus\ntext_seq = [\n    [token.lower() for token in word_tokenize(sentence)]\n    for sentence in sent_tokenize(text)\n]",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "ExecuteTime": {
          "start_time": "2021-02-11T11:00:54.331264Z",
          "end_time": "2021-02-11T11:00:54.987397Z"
        }
      },
      "id": "studied-cable",
      "cell_type": "code",
      "source": "# Create CBOW model (sg defaults to 0, which selects CBOW).\n# The embedding dimension stays at gensim's default of 100: the keyword is\n# `size` in gensim 3.x but `vector_size` in 4.x, so omitting it runs on both.\nmodel1 = gensim.models.Word2Vec(text_seq, min_count = 1, window = 5)\n\n# Print results\n# Similarity queries go through model1.wv; Word2Vec.similarity was deprecated\n# in gensim 3.x and removed in 4.x.\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'wonderland' - CBOW : \",\n      model1.wv.similarity('alice', 'wonderland'))\n\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'machines' - CBOW : \",\n      model1.wv.similarity('alice', 'machines'))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2021-02-11T11:01:30.415552Z",
          "end_time": "2021-02-11T11:01:31.267242Z"
        },
        "trusted": true
      },
      "id": "welcome-captain",
      "cell_type": "code",
      "source": "# Create Skip Gram model (sg = 1 selects skip-gram instead of CBOW).\n# The embedding dimension stays at gensim's default of 100: the keyword is\n# `size` in gensim 3.x but `vector_size` in 4.x, so omitting it runs on both.\nmodel2 = gensim.models.Word2Vec(text_seq, min_count = 1, window = 5, sg = 1)\n\n# Print results\n# Similarity queries go through model2.wv; Word2Vec.similarity was deprecated\n# in gensim 3.x and removed in 4.x.\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'wonderland' - Skip Gram : \",\n      model2.wv.similarity('alice', 'wonderland'))\n\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'machines' - Skip Gram : \",\n      model2.wv.similarity('alice', 'machines'))",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.7.6",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "nbTranslate": {
      "hotkey": "alt-t",
      "sourceLang": "en",
      "targetLang": "fr",
      "displayLangs": [],
      "langInMainMenu": true,
      "useGoogleTranslate": true
    },
    "toc": {
      "nav_menu": {},
      "number_sections": true,
      "sideBar": false,
      "skip_h1_title": true,
      "base_numbering": 1,
      "title_cell": "Table des matières",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": false
    },
    "varInspector": {
      "window_display": false,
      "cols": {
        "lenName": 16,
        "lenType": 16,
        "lenVar": 40
      },
      "kernels_config": {
        "python": {
          "library": "var_list.py",
          "delete_cmd_prefix": "del ",
          "delete_cmd_postfix": "",
          "varRefreshCmd": "print(var_dic_list())"
        },
        "r": {
          "library": "var_list.r",
          "delete_cmd_prefix": "rm(",
          "delete_cmd_postfix": ") ",
          "varRefreshCmd": "cat(var_dic_list()) "
        }
      },
      "types_to_exclude": [
        "module",
        "function",
        "builtin_function_or_method",
        "instance",
        "_Feature"
      ]
    },
    "gist": {
      "id": "",
      "data": {
        "description": "a minimal example of word vector in gensim",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
	{
	"cells": [
	{
	"metadata": {
	"ExecuteTime": {
	"end_time": "2021-02-11T10:46:28.455821Z",
	"start_time": "2021-02-11T10:46:28.452645Z"
	},
	"trusted": true
	},
	"id": "important-tractor",
	"cell_type": "code",
	"source": "from nltk.tokenize import sent_tokenize, word_tokenize \nimport gensim \nfrom gensim.models import Word2Vec\nimport warnings \nwarnings.filterwarnings(action = 'ignore') \nimport urllib.request",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"end_time": "2021-02-11T10:54:35.389258Z",
	"start_time": "2021-02-11T10:54:34.190907Z"
	},
	"scrolled": false,
	"trusted": true
	},
	"id": "latin-vietnamese",
	"cell_type": "code",
	"source": "# Download the corpus as plain text from Project Gutenberg\nurl = \"http://www.gutenberg.org/files/11/11-0.txt\"\n# The with-block closes the HTTP response as soon as the body has been read\nwith urllib.request.urlopen(url) as response:\n    raw_text = response.read().decode()\n# str.replace returns a new string, so the result must be bound to a name;\n# a bare call would leave the text unchanged\ntext = raw_text.replace(\"\\n\", \" \")\n# Show a short preview instead of echoing the whole book as cell output\ntext[:500]",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true,
	"ExecuteTime": {
	"start_time": "2021-02-11T11:00:02.342215Z",
	"end_time": "2021-02-11T11:00:02.577196Z"
	}
	},
	"id": "earned-stupid",
	"cell_type": "code",
	"source": "# One list of lower-cased word tokens per sentence of the corpus\ntext_seq = [\n    [token.lower() for token in word_tokenize(sentence)]\n    for sentence in sent_tokenize(text)\n]",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true,
	"ExecuteTime": {
	"start_time": "2021-02-11T11:00:54.331264Z",
	"end_time": "2021-02-11T11:00:54.987397Z"
	}
	},
	"id": "studied-cable",
	"cell_type": "code",
	"source": "# Create CBOW model (sg defaults to 0, which selects CBOW).\n# The embedding dimension stays at gensim's default of 100: the keyword is\n# `size` in gensim 3.x but `vector_size` in 4.x, so omitting it runs on both.\nmodel1 = gensim.models.Word2Vec(text_seq, min_count = 1, window = 5)\n\n# Print results\n# Similarity queries go through model1.wv; Word2Vec.similarity was deprecated\n# in gensim 3.x and removed in 4.x.\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'wonderland' - CBOW : \",\n      model1.wv.similarity('alice', 'wonderland'))\n\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'machines' - CBOW : \",\n      model1.wv.similarity('alice', 'machines'))",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2021-02-11T11:01:30.415552Z",
	"end_time": "2021-02-11T11:01:31.267242Z"
	},
	"trusted": true
	},
	"id": "welcome-captain",
	"cell_type": "code",
	"source": "# Create Skip Gram model (sg = 1 selects skip-gram instead of CBOW).\n# The embedding dimension stays at gensim's default of 100: the keyword is\n# `size` in gensim 3.x but `vector_size` in 4.x, so omitting it runs on both.\nmodel2 = gensim.models.Word2Vec(text_seq, min_count = 1, window = 5, sg = 1)\n\n# Print results\n# Similarity queries go through model2.wv; Word2Vec.similarity was deprecated\n# in gensim 3.x and removed in 4.x.\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'wonderland' - Skip Gram : \",\n      model2.wv.similarity('alice', 'wonderland'))\n\nprint(\"Cosine similarity between 'alice' \" +\n      \"and 'machines' - Skip Gram : \",\n      model2.wv.similarity('alice', 'machines'))",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.7.6",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"nbTranslate": {
	"hotkey": "alt-t",
	"sourceLang": "en",
	"targetLang": "fr",
	"displayLangs": [],
	"langInMainMenu": true,
	"useGoogleTranslate": true
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": false,
	"skip_h1_title": true,
	"base_numbering": 1,
	"title_cell": "Table des matières",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	},
	"varInspector": {
	"window_display": false,
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"library": "var_list.py",
	"delete_cmd_prefix": "del ",
	"delete_cmd_postfix": "",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"library": "var_list.r",
	"delete_cmd_prefix": "rm(",
	"delete_cmd_postfix": ") ",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	]
	},
	"gist": {
	"id": "",
	"data": {
	"description": "a minimal example of word vector in gensim",
	"public": true
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}