Extract, transform, classify, and load the news.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ETL News"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# @hidden_cell\n",
"# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.\n",
"from project_lib import Project\n",
"project = Project(project_id='********-****-******-****-**********', project_access_token='p-************************************')\n",
"pc = project.project_context"
]
},
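{
"cell_type": "markdown",
"metadata": {},
"source": [
"The token cell above only authenticates; the `Project` handle it creates is what later cells use to read and write project assets. Below is a minimal sketch of that usage, assuming only the `project_lib` calls that appear later in this notebook (`get_file` and `save_data`); the file names are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch of project_lib usage, based only on calls made later in this notebook.\n",
"## 'feature_news.csv' / 'feature_news_copy.csv' are illustrative file names.\n",
"import pandas as pd\n",
"\n",
"raw = project.get_file('feature_news.csv')   # returns a file-like object with the asset bytes\n",
"df = pd.read_csv(raw)                        # load it with pandas\n",
"project.save_data(data=df.to_csv(), file_name='feature_news_copy.csv', overwrite=True)"
]
},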
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install google-api-python-client==1.8.3\n",
"!pip install scrapy==2.1.0\n",
"!pip install attrs==19.3.0\n",
"!pip install scikit-learn==0.22\n",
"!pip install h5py==2.10.0\n",
"!pip install tensorflow==2.2.0rc0\n",
"!pip install keras==2.3.1\n",
"!pip install nltk==3.4"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"## Google Search API\n",
"from googleapiclient.discovery import build\n",
"\n",
"### Crawler Scrapy\n",
"import unicodedata\n",
"import scrapy\n",
"import requests\n",
"from scrapy.http import TextResponse\n",
"import json\n",
"import string\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"\n",
"### RNN Classifier\n",
"from keras.preprocessing.text import Tokenizer\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"\n",
"from sklearn import preprocessing\n",
"\n",
"import nltk\n",
"import unicodedata\n",
"import string\n",
"import re\n",
"import seaborn as sns\n",
"import itertools\n",
"import pickle\n",
"\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from keras.models import load_model\n",
"\n",
"DIM_GLOVE = 50\n",
"n_words = 20000"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## Google api\n",
"from googleapiclient.discovery import build\n",
"\n",
"\n",
"api_key = \"*********************************\"\n",
"\n",
"my_api_key = \"**********************************\"\n",
"my_cse_id = \"*******************************\" ## Custom Search element control API\n",
"\n",
"def google_search(search_term, api_key, cse_id, **kwargs):\n",
" service = build(\"customsearch\", \"v1\", developerKey=api_key)\n",
" res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()\n",
" return res"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Logging before flag parsing goes to stderr.\n",
"W0703 03:26:29.753084 139768582977344 __init__.py:46] file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 36, in autodetect\n",
" from google.appengine.api import memcache\n",
"ModuleNotFoundError: No module named 'google.appengine'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 33, in <module>\n",
" from oauth2client.contrib.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 37, in <module>\n",
" from oauth2client.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 42, in autodetect\n",
" from . import file_cache\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 41, in <module>\n",
" \"file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\"\n",
"ImportError: file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n"
]
}
],
"source": [
"dic_search = google_search(\"itaú\", my_api_key, my_cse_id, num=10)\n",
"df_search = pd.DataFrame(list(dic_search.items())[5][1])"
]
},
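{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Custom Search API returns at most 10 results per request. The next cell is a hedged sketch of collecting further pages by passing the API's `start` offset through `google_search`; the three-page loop and the query are illustrative, not part of the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Illustrative only: page through extra results with the CSE 'start' offset.\n",
"## Assumes google_search, my_api_key and my_cse_id from the cells above.\n",
"all_items = []\n",
"for start in (1, 11, 21):                              # three pages of up to 10 results\n",
"    page = google_search(\"itaú\", my_api_key, my_cse_id, num=10, start=start)\n",
"    all_items.extend(page.get('items', []))            # 'items' is absent when a page is empty\n",
"df_more = pd.DataFrame(all_items)"
]
},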
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import types\n",
"import pandas as pd\n",
"from botocore.client import Config\n",
"import ibm_boto3\n",
"\n",
"def __iter__(self): return 0\n",
"\n",
"# @hidden_cell\n",
"# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.\n",
"# You might want to remove those credentials before you share the notebook.\n",
"client_09d659ed064a4c049d2d1b42b7c7853b = ibm_boto3.client(service_name='s3',\n",
" ibm_api_key_id='********************************_*',\n",
" ibm_auth_endpoint=\"https://iam.cloud.ibm.com/oidc/token\",\n",
" config=Config(signature_version='oauth'),\n",
" endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')\n",
"\n",
"body = client_*************************.get_object(Bucket='*********************-donotdelete-**-***********',Key='feature_news.csv')['Body']\n",
"# add missing __iter__ method, so pandas accepts body as file-like object\n",
"if not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n",
"\n",
"feature_news = pd.read_csv(body)\n",
"filter_link = df_search[~df_search['link'].isin(feature_news.Link)].reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"## remove the quotes from the news\n",
"filter_itau_news = pd.DataFrame()\n",
"for i,x in enumerate(~filter_link['link'].str.contains('cotacoes')): ### removing the quotation page information\n",
" if x:\n",
" filter_itau_news= filter_itau_news.append(df_search[i:i+1]) \n",
"filter_itau_news = filter_itau_news[['link','snippet','title','pagemap']] ## filter cols in link, snippet and title\n",
"filter_itau_news = filter_itau_news.reset_index()\n",
"del filter_itau_news['index']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"itau_news = pd.DataFrame(columns= ['Date','Link','Text'])\n",
"for i,x in enumerate(filter_itau_news['link']):\n",
" res = requests.get(x)\n",
" response = TextResponse(res.url, body=res.text, encoding='utf-8')\n",
" lst = pd.DataFrame([[filter_itau_news['pagemap'][i]['metatags'][0]['article:published_time'][:10],filter_itau_news['link'][i], response.css(\".article-content p::text\").extract()]], columns= ['Date','Link','Text'])\n",
" itau_news = itau_news.append(lst)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/dsxuser/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/dsxuser/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"def strip_accents(sentence):\n",
" try:\n",
" sentence = unicode(sentence, 'utf-8')\n",
" except NameError: # unicode is a default on python 3 \n",
" pass\n",
"\n",
" sentence = unicodedata.normalize('NFD', sentence)\\\n",
" .encode('ascii', 'ignore')\\\n",
" .decode(\"utf-8\")\n",
" return str(sentence)\n",
"\n",
"def remove_punction(text):\n",
" no_punct = \"\".join(c for c in text if c not in string.punctuation)\n",
" return no_punct\n",
"\n",
"def remove_number(text):\n",
" no_number = ''.join([i for i in text if not i.isdigit()])\n",
" return no_number\n",
"\n",
"def lower_caser(text):\n",
" return text.lower()\n",
"\n",
"def remove_stopwords(text):\n",
" tokens = word_tokenize(text)\n",
" tokens = [t for t in tokens if not t in stopwords.words('portuguese')]\n",
" return \" \".join(map(str.strip, tokens))"
]
},
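{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the cleaning functions on a made-up Portuguese sentence, applied in the same order as the next cell; the sentence is illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Illustrative sentence; steps are applied in the same order as the cell below.\n",
"sample = \"O Itaú divulgou lucro de 6,5 bilhões no 2º trimestre de 2020.\"\n",
"sample = remove_number(sample)\n",
"sample = remove_stopwords(sample)\n",
"sample = lower_caser(sample)\n",
"sample = strip_accents(sample)\n",
"sample = remove_punction(sample)\n",
"print(sample)"
]
},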
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Text'] = itau_news['Text'].apply(remove_number)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_stopwords)\n",
"itau_news['Text'] = itau_news['Text'].apply(lower_caser)\n",
"itau_news['Text'] = itau_news['Text'].apply(strip_accents)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_punction)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"from io import BytesIO\n",
"model = load_model(project.get_file('RNNModel.tflearn'))\n",
"train_le = preprocessing.LabelEncoder() # creating a labelencoder"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Your data file was loaded into a botocore.response.StreamingBody object.\n",
"# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.\n",
"# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/\n",
"# pandas documentation: http://pandas.pydata.org/\n",
"streaming_body_2 = client_*****************************.get_object(Bucket='*********************-donotdelete-**-***********', Key='label_encoder.pkl')['Body']\n",
"# add missing __iter__ method so pandas accepts body as file-like object\n",
"if not hasattr(streaming_body_2, \"__iter__\"): streaming_body_2.__iter__ = types.MethodType( __iter__, streaming_body_2 ) \n"
]
},
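{
"cell_type": "markdown",
"metadata": {},
"source": [
"The comments above note that `get_object` returns a `botocore` StreamingBody. As a hedged sketch (the bucket and key names below are placeholders), the body can also be read fully into memory and handed to pandas or pickle, which avoids the `__iter__` patch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch: consume a StreamingBody by reading it fully into memory first.\n",
"## Bucket/key names are placeholders; the client is the one created earlier.\n",
"from io import BytesIO\n",
"import pickle\n",
"import pandas as pd\n",
"\n",
"obj = client_09d659ed064a4c049d2d1b42b7c7853b.get_object(Bucket='my-bucket', Key='some_table.csv')\n",
"df = pd.read_csv(BytesIO(obj['Body'].read()))\n",
"\n",
"obj = client_09d659ed064a4c049d2d1b42b7c7853b.get_object(Bucket='my-bucket', Key='some_object.pkl')\n",
"data = pickle.loads(obj['Body'].read())"
]
},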
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.preprocessing.label module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.preprocessing. Anything that cannot be imported from sklearn.preprocessing is now part of the private API.\n",
" warnings.warn(message, FutureWarning)\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
},
{
"data": {
"text/plain": [
"LabelEncoder()"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from io import BytesIO\n",
"readrawdata = streaming_body_2.read()\n",
"label_encoder = pickle.load(BytesIO(readrawdata))\n",
"train_le.fit(label_encoder)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def RNN(Text):\n",
" test_texts = [Text]\n",
" tokenizer = Tokenizer(num_words=n_words)\n",
" tokenizer.fit_on_texts(test_texts)\n",
" test_sequences = tokenizer.texts_to_sequences(test_texts)\n",
" test_input = pad_sequences(test_sequences, maxlen=DIM_GLOVE)\n",
" test_predictions_probas = model.predict(test_input)\n",
" test_predictions = test_predictions_probas.argmax(axis=-1)\n",
" test_intent_predictions = train_le.inverse_transform(test_predictions)\n",
" df_probas = pd.DataFrame(test_predictions_probas)\n",
" df_intent = pd.DataFrame(test_intent_predictions)\n",
" frames = [df_probas, df_intent]\n",
" result = pd.concat(frames)\n",
" result = pd.concat([df_probas, df_intent], axis=1)\n",
" x = result.transpose()[0:104] \n",
" x.columns = ['probability']\n",
" x = pd.to_numeric(x['probability'][:3])\n",
" x = pd.DataFrame(x)\n",
" labels = pd.DataFrame(label_encoder)\n",
" labels = labels.reset_index()\n",
" labels.columns = ['Labels','number']\n",
" result_table= labels.join(x)\n",
" result_table = result_table.sort_values(by=['probability'], ascending = False)\n",
" result_table.head(5)\n",
" return result_table.head(5).reset_index(drop = True)['number'][0]"
]
},
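{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before classifying the whole frame in the next cell, a single-text check of `RNN` can be run; the headline below is made up and is cleaned with the same helpers defined earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged usage sketch: classify one made-up, pre-cleaned headline with RNN.\n",
"example_text = remove_punction(strip_accents(lower_caser(\"Itaú anuncia novo programa de recompra de ações\")))\n",
"print(RNN(example_text))   # prints the predicted class label"
]
},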
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Class'] = itau_news['Text'].apply(RNN)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"feature_news = feature_news.append(itau_news[['Date','Link','Class']]).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'project' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-91a2b3e38107>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## Overwrite the csv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_news\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfile_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'itau_news.csv'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'project' is not defined"
]
}
],
"source": [
"## Overwrite the csv\n",
"project.save_data(data=feature_news.to_csv(),file_name='itau_news.csv',overwrite=True)"
]
},
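{
"cell_type": "markdown",
"metadata": {},
"source": [
"The recorded NameError above means this cell was re-run in a fresh kernel where the `project` handle from the first cell no longer existed. Below is a hedged sketch of restoring it before saving; the id and token values are placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch: re-create the Project handle before saving (placeholder credentials).\n",
"from project_lib import Project\n",
"project = Project(project_id='<project-id>', project_access_token='<access-token>')\n",
"project.save_data(data=feature_news.to_csv(), file_name='itau_news.csv', overwrite=True)"
]
},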
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}