{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>diffbotUri</th>\n",
" <th>images</th>\n",
" <th>specs</th>\n",
" <th>text</th>\n",
" <th>title</th>\n",
" <th>tokens</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>product|4|-1577418857</td>\n",
" <td>[product|4|-1577418857_1, product|4|-157741885...</td>\n",
" <td>{'size': 'All Sizes', 'color': 'Black'}</td>\n",
" <td>Our organization is involved in fabrication of...</td>\n",
" <td>Offering you a complete choice of products whi...</td>\n",
" <td>[Offering, you, a, complete, choice, of, produ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>product|4|2080697291</td>\n",
" <td>[product|4|2080697291_1, product|4|2080697291_0]</td>\n",
" <td>{'specifications': '-'}</td>\n",
" <td>We are engaged in offering wide range of Cast ...</td>\n",
" <td>Acrylic Sheet in Surat</td>\n",
" <td>[Acrylic, Sheet, in, Surat, We, are, engaged, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>product|4|802308696</td>\n",
" <td>[product|4|802308696_0]</td>\n",
" <td>{'form_factor': 'Desktop', 'total_number_of_us...</td>\n",
" <td>Application data, virtual images, client files...</td>\n",
" <td>Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive</td>\n",
" <td>[Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" diffbotUri images \\\n",
"0 product|4|-1577418857 [product|4|-1577418857_1, product|4|-157741885... \n",
"1 product|4|2080697291 [product|4|2080697291_1, product|4|2080697291_0] \n",
"2 product|4|802308696 [product|4|802308696_0] \n",
"\n",
" specs \\\n",
"0 {'size': 'All Sizes', 'color': 'Black'} \n",
"1 {'specifications': '-'} \n",
"2 {'form_factor': 'Desktop', 'total_number_of_us... \n",
"\n",
" text \\\n",
"0 Our organization is involved in fabrication of... \n",
"1 We are engaged in offering wide range of Cast ... \n",
"2 Application data, virtual images, client files... \n",
"\n",
" title \\\n",
"0 Offering you a complete choice of products whi... \n",
"1 Acrylic Sheet in Surat \n",
"2 Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive \n",
"\n",
" tokens \n",
"0 [Offering, you, a, complete, choice, of, produ... \n",
"1 [Acrylic, Sheet, in, Surat, We, are, engaged, ... \n",
"2 [Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk... "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_json('sample/dataset-000-sample.json')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"dir_name = 'train/'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "001812e635e24d7191a1e8528ef72fa7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=800), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"jsons = os.listdir(dir_name)\n",
"df_arr = []\n",
"\n",
"for val in tqdm(jsons):\n",
" df_t = pd.read_json(dir_name + val)\n",
" df_arr.append(df_t)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat(df_arr, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 224417 entries, 0 to 224416\n",
"Data columns (total 6 columns):\n",
"diffbotUri 224417 non-null object\n",
"images 224417 non-null object\n",
"specs 224417 non-null object\n",
"text 224417 non-null object\n",
"title 224417 non-null object\n",
"tokens 224417 non-null object\n",
"dtypes: object(6)\n",
"memory usage: 10.3+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"t_l = list(df['text'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1802466"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(t_l)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# token\n",
"For textual description D, we first tokenize the text using the Stanford tokenizer [natural language processing toolkit. ], \n",
"followed by embedding all of the words using the Glove algorithm [ In\n",
"Empirical Methods in Natural Language Processing (EMNLP)] \n",
"on all of the descriptions in\n",
"the training data. "
]
},
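{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the embedding step, assuming the pretrained `glove-wiki-gigaword-100` vectors from gensim as a stand-in for GloVe vectors trained on our own descriptions (the actual training-from-scratch step is not shown)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: look up GloVe vectors for one description's tokens.\n",
"# Assumes gensim is installed; out-of-vocabulary tokens are skipped.\n",
"import gensim.downloader as api\n",
"\n",
"glove = api.load('glove-wiki-gigaword-100')  # 100-d pretrained vectors\n",
"tokens = [t.lower() for t in df.iloc[0]['tokens']]\n",
"vectors = np.array([glove[t] for t in tokens if t in glove])\n",
"vectors.shape  # (n_known_tokens, 100)"
]
},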
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import nltk \n",
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from nltk.tokenize.stanford import StanfordTokenizer\n",
"from nltk.tag import pos_tag\n",
"from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
"from textblob import TextBlob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"col0 = df.iloc[1]\n",
"col0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nltk_tokens = word_tokenize(col0['title'] + ' ' + col0['text'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"temp_tokens = col0['tokens']\n",
"\n",
"print(nltk_tokens)\n",
"print()\n",
"print(temp_tokens)\n",
"# for t, n in zip(nltk_tokens, temp_tokens):\n",
"# print(t, n)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set(temp_tokens) - set(nltk_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"nltk_tag = pos_tag(nltk_tokens)\n",
"\n",
"print(nltk_tag)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb = TextBlob(col0['text'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tb.sentiment"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7f2e8a070c114ce7b4330adb6815331a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=224417), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from concurrent.futures import ThreadPoolExecutor\n",
"\n",
"sentiment_arr = []\n",
"str_len_arr = []\n",
"\n",
"def f(x):\n",
" row = x[1]\n",
" polarity = TextBlob(row['text']).polarity\n",
" str_len = len(row['text'])\n",
" \n",
" return polarity, str_len\n",
"\n",
"with ThreadPoolExecutor() as executor:\n",
" result = executor.map(f, tqdm(df.iterrows(), total=224417))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2883570fdbc04b0ca3a7c6f5a85a9b82",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=224417), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-10-d486a0588173>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m224417\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mpolarity\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextBlob\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpolarity\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mstr_len\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\decorators.py\u001b[0m in \u001b[0;36m__get__\u001b[1;34m(self, obj, cls)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mobj\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 24\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__dict__\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 25\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\blob.py\u001b[0m in \u001b[0;36mpolarity\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 451\u001b[0m \u001b[1;33m:\u001b[0m\u001b[0mrtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mfloat\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 452\u001b[0m \"\"\"\n\u001b[1;32m--> 453\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mPatternAnalyzer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0manalyze\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 454\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 455\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mcached_property\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\en\\sentiments.py\u001b[0m in \u001b[0;36manalyze\u001b[1;34m(self, text, keep_assessments)\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 43\u001b[0m \u001b[0mSentiment\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnamedtuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Sentiment'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'polarity'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'subjectivity'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 44\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mSentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mpattern_sentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 45\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, s, negation, **kwargs)\u001b[0m\n\u001b[0;32m 821\u001b[0m \u001b[1;31m# Sentiment(\"a horrible movie\") => (-0.6, 1.0)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 822\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 823\u001b[1;33m \u001b[0ma\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0massessments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mw\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;34m\" \"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnegation\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 824\u001b[0m \u001b[1;31m# A pattern.en.Text.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 825\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"sentences\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36mfind_tokens\u001b[1;34m(self, string, **kwargs)\u001b[0m\n\u001b[0;32m 1175\u001b[0m \u001b[0mabbreviations\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"abbreviations\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mABBREVIATIONS\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1176\u001b[0m \u001b[0mreplace\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m \u001b[1;34m\"replace\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreplacements\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1177\u001b[1;33m linebreak = r\"\\n{2,}\")\n\u001b[0m\u001b[0;32m 1178\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1179\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mfind_tags\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\textblob\\_text.py\u001b[0m in \u001b[0;36mfind_tokens\u001b[1;34m(string, punctuation, abbreviations, replace, linebreak)\u001b[0m\n\u001b[0;32m 286\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[0mtokens\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mt\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m;\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mt\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 288\u001b[1;33m \u001b[1;32mwhile\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 289\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mreplace\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 290\u001b[0m \u001b[1;31m# Split trailing punctuation.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"\n",
"sentiment_arr = []\n",
"str_len_arr = []\n",
"\n",
"for idx, row in tqdm(df.iterrows(), total=224417):\n",
" polarity = TextBlob(row['text']).polarity\n",
" str_len = len(row['text'])\n",
" \n",
" sentiment_arr.append(polarity)\n",
" str_len_arr.append(str_len)\n",
" \n",
"df['sentiment'] = sentiment_arr\n",
"df['length'] = str_len_arr"
]
},
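{
"cell_type": "markdown",
"metadata": {},
"source": [
"The serial loop above was interrupted by hand: TextBlob's pattern analyzer is pure Python and CPU-bound, so scoring all 224,417 texts one at a time (or in threads, under the GIL) is slow. The standalone script at the end of this gist moves the scoring into a `ProcessPoolExecutor` instead."
]
},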
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.hist(df['sentiment'], 50)\n",
"plt.title(f'{dir_name[:-1]} sentiment')\n",
"plt.ylabel('count')\n",
"plt.xlabel('sentiment')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.hist(df['length'], 50)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
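The standalone script below (presumably a second file in this gist) re-runs the sentiment pass over the val/ split with a ProcessPoolExecutor, sidestepping the GIL bottleneck that stalled the serial and threaded attempts in the notebook: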
import concurrent.futures
import os
import time

import pandas as pd
from tqdm import tqdm
from textblob import TextBlob


def f(x):
    # Sentiment polarity of a single description.
    return TextBlob(x).polarity


def main():
    dir_name = 'val/'
    jsons = os.listdir(dir_name)
    df_arr = []

    for val in tqdm(jsons):
        df_t = pd.read_json(dir_name + val)
        df_arr.append(df_t)

    df = pd.concat(df_arr, ignore_index=True)
    text_arr = list(df['text'])
    sent_arr = []

    # Processes (not threads) sidestep the GIL for the CPU-bound
    # TextBlob scoring that stalled the notebook loop.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for sent in executor.map(f, tqdm(text_arr)):
            sent_arr.append(sent)

    print(sum(sent_arr) / len(sent_arr))


if __name__ == '__main__':
    s_time = time.time()
    main()
    print(f'elapsed: {time.time() - s_time:.1f}s')
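ProcessPoolExecutor.map pickles every item it sends to a worker; with a list this long, passing a chunksize (e.g. executor.map(f, text_arr, chunksize=256)) batches that traffic and usually reduces the overhead. The 256 here is an illustrative guess, not a tuned value.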