Last active
July 18, 2019 08:55
-
-
Save yongjun823/0491efd597289ae8a1bf9e853014827c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import pandas as pd\n", | |
"from tqdm import tqdm_notebook as tqdm\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>diffbotUri</th>\n", | |
" <th>images</th>\n", | |
" <th>specs</th>\n", | |
" <th>text</th>\n", | |
" <th>title</th>\n", | |
" <th>tokens</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>product|4|-1577418857</td>\n", | |
" <td>[product|4|-1577418857_1, product|4|-157741885...</td>\n", | |
" <td>{'size': 'All Sizes', 'color': 'Black'}</td>\n", | |
" <td>Our organization is involved in fabrication of...</td>\n", | |
" <td>Offering you a complete choice of products whi...</td>\n", | |
" <td>[Offering, you, a, complete, choice, of, produ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>product|4|2080697291</td>\n", | |
" <td>[product|4|2080697291_1, product|4|2080697291_0]</td>\n", | |
" <td>{'specifications': '-'}</td>\n", | |
" <td>We are engaged in offering wide range of Cast ...</td>\n", | |
" <td>Acrylic Sheet in Surat</td>\n", | |
" <td>[Acrylic, Sheet, in, Surat, We, are, engaged, ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>product|4|802308696</td>\n", | |
" <td>[product|4|802308696_0]</td>\n", | |
" <td>{'form_factor': 'Desktop', 'total_number_of_us...</td>\n", | |
" <td>Application data, virtual images, client files...</td>\n", | |
" <td>Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive</td>\n", | |
" <td>[Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" diffbotUri images \\\n", | |
"0 product|4|-1577418857 [product|4|-1577418857_1, product|4|-157741885... \n", | |
"1 product|4|2080697291 [product|4|2080697291_1, product|4|2080697291_0] \n", | |
"2 product|4|802308696 [product|4|802308696_0] \n", | |
"\n", | |
" specs \\\n", | |
"0 {'size': 'All Sizes', 'color': 'Black'} \n", | |
"1 {'specifications': '-'} \n", | |
"2 {'form_factor': 'Desktop', 'total_number_of_us... \n", | |
"\n", | |
" text \\\n", | |
"0 Our organization is involved in fabrication of... \n", | |
"1 We are engaged in offering wide range of Cast ... \n", | |
"2 Application data, virtual images, client files... \n", | |
"\n", | |
" title \\\n", | |
"0 Offering you a complete choice of products whi... \n", | |
"1 Acrylic Sheet in Surat \n", | |
"2 Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive \n", | |
"\n", | |
" tokens \n", | |
"0 [Offering, you, a, complete, choice, of, produ... \n", | |
"1 [Acrylic, Sheet, in, Surat, We, are, engaged, ... \n", | |
"2 [Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk... " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_json('sample/dataset-000-sample.json')\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dir_name = 'train/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "001812e635e24d7191a1e8528ef72fa7", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=800), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"jsons = os.listdir(dir_name)\n", | |
"df_arr = []\n", | |
"\n", | |
"for val in tqdm(jsons):\n", | |
" df_t = pd.read_json(dir_name + val)\n", | |
" df_arr.append(df_t)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.concat(df_arr, ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 224417 entries, 0 to 224416\n", | |
"Data columns (total 6 columns):\n", | |
"diffbotUri 224417 non-null object\n", | |
"images 224417 non-null object\n", | |
"specs 224417 non-null object\n", | |
"text 224417 non-null object\n", | |
"title 224417 non-null object\n", | |
"tokens 224417 non-null object\n", | |
"dtypes: object(6)\n", | |
"memory usage: 10.3+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"t_l = list(df['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1802466" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(t_l)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# token\n", | |
"For textual description D, we first tokenize the text using the Stanford tokenizer [natural language processing toolkit. ], \n", | |
"followed by embedding all of the words using the Glove algorithm [ In\n", | |
"Empirical Methods in Natural Language Processing (EMNLP)] \n", | |
"on all of the descriptions in\n", | |
"the training data. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import nltk \n", | |
"from nltk.tokenize import word_tokenize, sent_tokenize\n", | |
"from nltk.tokenize.stanford import StanfordTokenizer\n", | |
"from nltk.tag import pos_tag\n", | |
"from nltk.stem import PorterStemmer, WordNetLemmatizer\n", | |
"from textblob import TextBlob" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"col0 = df.iloc[1]\n", | |
"col0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nltk_tokens = word_tokenize(col0['title'] + ' ' + col0['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"temp_tokens = col0['tokens']\n", | |
"\n", | |
"print(nltk_tokens)\n", | |
"print()\n", | |
"print(temp_tokens)\n", | |
"# for t, n in zip(nltk_tokens, temp_tokens):\n", | |
"# print(t, n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"set(temp_tokens) - set(nltk_tokens)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"nltk_tag = pos_tag(nltk_tokens)\n", | |
"\n", | |
"print(nltk_tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tb = TextBlob(col0['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tb.sentiment" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7f2e8a070c114ce7b4330adb6815331a", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=224417), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from concurrent.futures import ThreadPoolExecutor\n", | |
"\n", | |
"sentiment_arr = []\n", | |
"str_len_arr = []\n", | |
"\n", | |
"def f(x):\n", | |
" row = x[1]\n", | |
" polarity = TextBlob(row['text']).polarity\n", | |
" str_len = len(row['text'])\n", | |
" \n", | |
" return polarity, str_len\n", | |
"\n", | |
"with ThreadPoolExecutor() as executor:\n", | |
" result = executor.map(f, tqdm(df.iterrows(), total=224417))\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "2883570fdbc04b0ca3a7c6f5a85a9b82", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=224417), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m<ipython-input-10-d486a0588173>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m224417\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mpolarity\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextBlob\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpolarity\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mstr_len\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\decorators.py\u001b[0m in \u001b[0;36m__get__\u001b[1;34m(self, obj, cls)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mobj\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 24\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__dict__\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 25\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\blob.py\u001b[0m in \u001b[0;36mpolarity\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 451\u001b[0m \u001b[1;33m:\u001b[0m\u001b[0mrtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mfloat\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 452\u001b[0m \"\"\"\n\u001b[1;32m--> 453\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mPatternAnalyzer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0manalyze\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 454\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 455\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mcached_property\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\en\\sentiments.py\u001b[0m in \u001b[0;36manalyze\u001b[1;34m(self, text, keep_assessments)\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 43\u001b[0m \u001b[0mSentiment\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnamedtuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Sentiment'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'polarity'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'subjectivity'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 44\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mSentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mpattern_sentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 45\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, s, negation, **kwargs)\u001b[0m\n\u001b[0;32m 821\u001b[0m \u001b[1;31m# Sentiment(\"a horrible movie\") => (-0.6, 1.0)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 822\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 823\u001b[1;33m \u001b[0ma\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0massessments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mw\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;34m\" \"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnegation\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 824\u001b[0m \u001b[1;31m# A pattern.en.Text.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 825\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"sentences\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36mfind_tokens\u001b[1;34m(self, string, **kwargs)\u001b[0m\n\u001b[0;32m 1175\u001b[0m \u001b[0mabbreviations\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"abbreviations\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mABBREVIATIONS\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1176\u001b[0m \u001b[0mreplace\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m \u001b[1;34m\"replace\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreplacements\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1177\u001b[1;33m linebreak = r\"\\n{2,}\")\n\u001b[0m\u001b[0;32m 1178\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1179\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mfind_tags\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36mfind_tokens\u001b[1;34m(string, punctuation, abbreviations, replace, linebreak)\u001b[0m\n\u001b[0;32m 286\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mt\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m;\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mt\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 288\u001b[1;33m \u001b[1;32mwhile\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 289\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mreplace\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 290\u001b[0m \u001b[1;31m# Split trailing punctuation.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"sentiment_arr = []\n", | |
"str_len_arr = []\n", | |
"\n", | |
"for idx, row in tqdm(df.iterrows(), total=224417):\n", | |
" polarity = TextBlob(row['text']).polarity\n", | |
" str_len = len(row['text'])\n", | |
" \n", | |
" sentiment_arr.append(polarity)\n", | |
" str_len_arr.append(str_len)\n", | |
" \n", | |
"df['sentiment'] = sentiment_arr\n", | |
"df['length'] = str_len_arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"plt.hist(df['sentiment'], 50)\n", | |
"plt.title(f'{dir_name[:-1]} sentiment')\n", | |
"plt.ylabel('count')\n", | |
"plt.xlabel('sentiment')\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"plt.hist(df['length'], 50)\n", | |
"plt.show()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures | |
import os | |
import pandas as pd | |
import time | |
# from tqdm import tqdm_notebook as tqdm | |
from tqdm import tqdm | |
# import matplotlib.pyplot as plt | |
from textblob import TextBlob | |
def f(x): | |
polarity = TextBlob(x).polarity | |
return polarity | |
def main(): | |
dir_name = 'val/' | |
jsons = os.listdir(dir_name) | |
df_arr = [] | |
for val in tqdm(jsons): | |
df_t = pd.read_json(dir_name + val) | |
df_arr.append(df_t) | |
df = pd.concat(df_arr, ignore_index=True) | |
text_arr = list(df['text']) | |
sent_arr = [] | |
with concurrent.futures.ProcessPoolExecutor() as executor: | |
for sent in executor.map(f, tqdm(text_arr)): | |
sent_arr.append(sent) | |
print(sum(sent_arr) / len(sent_arr)) | |
if __name__ == '__main__': | |
s_time = time.time() | |
main() | |
print(f'end time {time.time() - s_time}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment