@cristiano74
Created May 9, 2018 15:59
word2vec-nlp-tutorial.ipynb
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#https://www.researchgate.net/post/How_to_find_semantic_similarity_between_two_documents\n\n#https://www.kaggle.com/c/word2vec-nlp-tutorial#part-2-word-vectors",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd\ntrain = pd.read_csv( \"labeledTrainData.tsv\", header=0, \n delimiter=\"\\t\", quoting=3 )",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"code_folding": [],
"trusted": true
},
"cell_type": "code",
"source": "# Read data from files \n\ntest = pd.read_csv( \"testData.tsv\", header=0, delimiter=\"\\t\", quoting=3 )\nunlabeled_train = pd.read_csv( \"unlabeledTrainData.tsv\", header=0,delimiter=\"\\t\", quoting=3 )\n\n# Verify the number of reviews that were read (100,000 in total)\nprint (\"Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\\n\" % (train[\"review\"].size, \n test[\"review\"].size, unlabeled_train[\"review\"].size ))",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews\n\n"
}
]
},
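{
"metadata": {},
"cell_type": "markdown",
"source": "*Illustrative addition (not in the original gist):* a quick sanity check on the three DataFrames loaded above, assuming the files were read as in the previous cells."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Sanity check (illustrative, not in the original run): shapes and column names\n# of the DataFrames loaded above.\nprint(train.shape, test.shape, unlabeled_train.shape)\nprint(train.columns.values)",
"execution_count": null,
"outputs": []
},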
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Import various modules for string cleaning\nfrom bs4 import BeautifulSoup\nimport re\nfrom nltk.corpus import stopwords\n\ndef review_to_wordlist( review, remove_stopwords=False ):\n # Function to convert a document to a sequence of words,\n # optionally removing stop words. Returns a list of words.\n #\n # 1. Remove HTML\n review_text = BeautifulSoup(review).get_text()\n # \n # 2. Remove non-letters\n review_text = re.sub(\"[^a-zA-Z]\",\" \", review_text)\n #\n # 3. Convert words to lower case and split them\n words = review_text.lower().split()\n #\n # 4. Optionally remove stop words (false by default)\n if remove_stopwords:\n stops = set(stopwords.words(\"english\"))\n words = [w for w in words if not w in stops]\n #\n # 5. Return a list of words\n return(words)",
"execution_count": 14,
"outputs": []
}
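,
{
"metadata": {},
"cell_type": "markdown",
"source": "*Illustrative addition (not in the original gist):* apply `review_to_wordlist` to the first training review to show what the cleaning step produces; the exact output is not reproduced here."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Example usage (illustrative): clean the first training review with stop-word\n# removal and inspect the first few tokens.\nexample_words = review_to_wordlist(train[\"review\"][0], remove_stopwords=True)\nprint(example_words[:10])",
"execution_count": null,
"outputs": []
}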
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"varInspector": {
"window_display": false,
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"library": "var_list.py",
"delete_cmd_prefix": "del ",
"delete_cmd_postfix": "",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"library": "var_list.r",
"delete_cmd_prefix": "rm(",
"delete_cmd_postfix": ") ",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
]
},
"gist": {
"id": "",
"data": {
"description": "word2vec-nlp-tutorial.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}