Extract, transform, classify, and load the news.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ETL News"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# @hidden_cell\n",
"# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.\n",
"from project_lib import Project\n",
"project = Project(project_id='********-****-******-****-**********', project_access_token='p-************************************')\n",
"pc = project.project_context"
]
},
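{
"cell_type": "markdown",
"metadata": {},
"source": [
"The token cell above only authenticates; the `Project` handle it creates is what later cells use to read and write project assets. Below is a minimal sketch of that usage, assuming only the `project_lib` calls that appear later in this notebook (`get_file` and `save_data`); the file names are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch of project_lib usage, based only on calls made later in this notebook.\n",
"## 'feature_news.csv' / 'feature_news_copy.csv' are illustrative file names.\n",
"import pandas as pd\n",
"\n",
"raw = project.get_file('feature_news.csv')   # returns a file-like object with the asset bytes\n",
"df = pd.read_csv(raw)                        # load it with pandas\n",
"project.save_data(data=df.to_csv(), file_name='feature_news_copy.csv', overwrite=True)"
]
},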
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install google-api-python-client==1.8.3\n",
"!pip install scrapy==2.1.0\n",
"!pip install attrs==19.3.0\n",
"!pip install scikit-learn==0.22\n",
"!pip install h5py==2.10.0\n",
"!pip install tensorflow==2.2.0rc0\n",
"!pip install keras==2.3.1\n",
"!pip install nltk==3.4"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"## Google Search API\n",
"from googleapiclient.discovery import build\n",
"\n",
"### Crawler Scrapy\n",
"import unicodedata\n",
"import scrapy\n",
"import requests\n",
"from scrapy.http import TextResponse\n",
"import json\n",
"import string\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"\n",
"### RNN Classifier\n",
"from keras.preprocessing.text import Tokenizer\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"\n",
"from sklearn import preprocessing\n",
"\n",
"import nltk\n",
"import unicodedata\n",
"import string\n",
"import re\n",
"import seaborn as sns\n",
"import itertools\n",
"import pickle\n",
"\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from keras.models import load_model\n",
"\n",
"DIM_GLOVE = 50\n",
"n_words = 20000"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## Google api\n",
"from googleapiclient.discovery import build\n",
"\n",
"\n",
"api_key = \"*********************************\"\n",
"\n",
"my_api_key = \"**********************************\"\n",
"my_cse_id = \"*******************************\" ## Custom Search element control API\n",
"\n",
"def google_search(search_term, api_key, cse_id, **kwargs):\n",
" service = build(\"customsearch\", \"v1\", developerKey=api_key)\n",
" res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()\n",
" return res"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Logging before flag parsing goes to stderr.\n",
"W0703 03:26:29.753084 139768582977344 __init__.py:46] file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 36, in autodetect\n",
" from google.appengine.api import memcache\n",
"ModuleNotFoundError: No module named 'google.appengine'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 33, in <module>\n",
" from oauth2client.contrib.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 37, in <module>\n",
" from oauth2client.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 42, in autodetect\n",
" from . import file_cache\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 41, in <module>\n",
" \"file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\"\n",
"ImportError: file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n"
]
}
],
"source": [
"dic_search = google_search(\"itaú\", my_api_key, my_cse_id, num=10)\n",
"df_search = pd.DataFrame(list(dic_search.items())[5][1])"
]
},
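{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Custom Search API returns at most 10 results per request. The next cell is a hedged sketch of collecting further pages by passing the API's `start` offset through `google_search`; the three-page loop and the query are illustrative, not part of the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Illustrative only: page through extra results with the CSE 'start' offset.\n",
"## Assumes google_search, my_api_key and my_cse_id from the cells above.\n",
"all_items = []\n",
"for start in (1, 11, 21):                              # three pages of up to 10 results\n",
"    page = google_search(\"itaú\", my_api_key, my_cse_id, num=10, start=start)\n",
"    all_items.extend(page.get('items', []))            # 'items' is absent when a page is empty\n",
"df_more = pd.DataFrame(all_items)"
]
},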
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import types\n",
"import pandas as pd\n",
"from botocore.client import Config\n",
"import ibm_boto3\n",
"\n",
"def __iter__(self): return 0\n",
"\n",
"# @hidden_cell\n",
"# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.\n",
"# You might want to remove those credentials before you share the notebook.\n",
"client_09d659ed064a4c049d2d1b42b7c7853b = ibm_boto3.client(service_name='s3',\n",
" ibm_api_key_id='********************************_*',\n",
" ibm_auth_endpoint=\"https://iam.cloud.ibm.com/oidc/token\",\n",
" config=Config(signature_version='oauth'),\n",
" endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')\n",
"\n",
"body = client_*************************.get_object(Bucket='*********************-donotdelete-**-***********',Key='feature_news.csv')['Body']\n",
"# add missing __iter__ method, so pandas accepts body as file-like object\n",
"if not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n",
"\n",
"feature_news = pd.read_csv(body)\n",
"filter_link = df_search[~df_search['link'].isin(feature_news.Link)].reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"## remove the quotes from the news\n",
"filter_itau_news = pd.DataFrame()\n",
"for i,x in enumerate(~filter_link['link'].str.contains('cotacoes')): ### removing the quotation page information\n",
" if x:\n",
" filter_itau_news= filter_itau_news.append(df_search[i:i+1]) \n",
"filter_itau_news = filter_itau_news[['link','snippet','title','pagemap']] ## filter cols in link, snippet and title\n",
"filter_itau_news = filter_itau_news.reset_index()\n",
"del filter_itau_news['index']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"itau_news = pd.DataFrame(columns= ['Date','Link','Text'])\n",
"for i,x in enumerate(filter_itau_news['link']):\n",
" res = requests.get(x)\n",
" response = TextResponse(res.url, body=res.text, encoding='utf-8')\n",
" lst = pd.DataFrame([[filter_itau_news['pagemap'][i]['metatags'][0]['article:published_time'][:10],filter_itau_news['link'][i], response.css(\".article-content p::text\").extract()]], columns= ['Date','Link','Text'])\n",
" itau_news = itau_news.append(lst)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/dsxuser/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/dsxuser/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"def strip_accents(sentence):\n",
" try:\n",
" sentence = unicode(sentence, 'utf-8')\n",
" except NameError: # unicode is a default on python 3 \n",
" pass\n",
"\n",
" sentence = unicodedata.normalize('NFD', sentence)\\\n",
" .encode('ascii', 'ignore')\\\n",
" .decode(\"utf-8\")\n",
" return str(sentence)\n",
"\n",
"def remove_punction(text):\n",
" no_punct = \"\".join(c for c in text if c not in string.punctuation)\n",
" return no_punct\n",
"\n",
"def remove_number(text):\n",
" no_number = ''.join([i for i in text if not i.isdigit()])\n",
" return no_number\n",
"\n",
"def lower_caser(text):\n",
" return text.lower()\n",
"\n",
"def remove_stopwords(text):\n",
" tokens = word_tokenize(text)\n",
" tokens = [t for t in tokens if not t in stopwords.words('portuguese')]\n",
" return \" \".join(map(str.strip, tokens))"
]
},
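{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the cleaning functions on a made-up Portuguese sentence, applied in the same order as the next cell; the sentence is illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Illustrative sentence; steps are applied in the same order as the cell below.\n",
"sample = \"O Itaú divulgou lucro de 6,5 bilhões no 2º trimestre de 2020.\"\n",
"sample = remove_number(sample)\n",
"sample = remove_stopwords(sample)\n",
"sample = lower_caser(sample)\n",
"sample = strip_accents(sample)\n",
"sample = remove_punction(sample)\n",
"print(sample)"
]
},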
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Text'] = itau_news['Text'].apply(remove_number)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_stopwords)\n",
"itau_news['Text'] = itau_news['Text'].apply(lower_caser)\n",
"itau_news['Text'] = itau_news['Text'].apply(strip_accents)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_punction)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"from io import BytesIO\n",
"model = load_model(project.get_file('RNNModel.tflearn'))\n",
"train_le = preprocessing.LabelEncoder() # creating a labelencoder"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Your data file was loaded into a botocore.response.StreamingBody object.\n",
"# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.\n",
"# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/\n",
"# pandas documentation: http://pandas.pydata.org/\n",
"streaming_body_2 = client_*****************************.get_object(Bucket='*********************-donotdelete-**-***********', Key='label_encoder.pkl')['Body']\n",
"# add missing __iter__ method so pandas accepts body as file-like object\n",
"if not hasattr(streaming_body_2, \"__iter__\"): streaming_body_2.__iter__ = types.MethodType( __iter__, streaming_body_2 ) \n"
]
},
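{
"cell_type": "markdown",
"metadata": {},
"source": [
"The comments above note that `get_object` returns a `botocore` StreamingBody. As a hedged sketch (the bucket and key names below are placeholders), the body can also be read fully into memory and handed to pandas or pickle, which avoids the `__iter__` patch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch: consume a StreamingBody by reading it fully into memory first.\n",
"## Bucket/key names are placeholders; the client is the one created earlier.\n",
"from io import BytesIO\n",
"import pickle\n",
"import pandas as pd\n",
"\n",
"obj = client_09d659ed064a4c049d2d1b42b7c7853b.get_object(Bucket='my-bucket', Key='some_table.csv')\n",
"df = pd.read_csv(BytesIO(obj['Body'].read()))\n",
"\n",
"obj = client_09d659ed064a4c049d2d1b42b7c7853b.get_object(Bucket='my-bucket', Key='some_object.pkl')\n",
"data = pickle.loads(obj['Body'].read())"
]
},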
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.preprocessing.label module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.preprocessing. Anything that cannot be imported from sklearn.preprocessing is now part of the private API.\n",
" warnings.warn(message, FutureWarning)\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
},
{
"data": {
"text/plain": [
"LabelEncoder()"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from io import BytesIO\n",
"readrawdata = streaming_body_2.read()\n",
"label_encoder = pickle.load(BytesIO(readrawdata))\n",
"train_le.fit(label_encoder)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def RNN(Text):\n",
" test_texts = [Text]\n",
" tokenizer = Tokenizer(num_words=n_words)\n",
" tokenizer.fit_on_texts(test_texts)\n",
" test_sequences = tokenizer.texts_to_sequences(test_texts)\n",
" test_input = pad_sequences(test_sequences, maxlen=DIM_GLOVE)\n",
" test_predictions_probas = model.predict(test_input)\n",
" test_predictions = test_predictions_probas.argmax(axis=-1)\n",
" test_intent_predictions = train_le.inverse_transform(test_predictions)\n",
" df_probas = pd.DataFrame(test_predictions_probas)\n",
" df_intent = pd.DataFrame(test_intent_predictions)\n",
" frames = [df_probas, df_intent]\n",
" result = pd.concat(frames)\n",
" result = pd.concat([df_probas, df_intent], axis=1)\n",
" x = result.transpose()[0:104] \n",
" x.columns = ['probability']\n",
" x = pd.to_numeric(x['probability'][:3])\n",
" x = pd.DataFrame(x)\n",
" labels = pd.DataFrame(label_encoder)\n",
" labels = labels.reset_index()\n",
" labels.columns = ['Labels','number']\n",
" result_table= labels.join(x)\n",
" result_table = result_table.sort_values(by=['probability'], ascending = False)\n",
" result_table.head(5)\n",
" return result_table.head(5).reset_index(drop = True)['number'][0]"
]
},
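{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before classifying the whole frame in the next cell, a single-text check of `RNN` can be run; the headline below is made up and is cleaned with the same helpers defined earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged usage sketch: classify one made-up, pre-cleaned headline with RNN.\n",
"example_text = remove_punction(strip_accents(lower_caser(\"Itaú anuncia novo programa de recompra de ações\")))\n",
"print(RNN(example_text))   # prints the predicted class label"
]
},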
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Class'] = itau_news['Text'].apply(RNN)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"feature_news = feature_news.append(itau_news[['Date','Link','Class']]).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'project' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-91a2b3e38107>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## Overwrite the csv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_news\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfile_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'itau_news.csv'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'project' is not defined"
]
}
],
"source": [
"## Overwrite the csv\n",
"project.save_data(data=feature_news.to_csv(),file_name='itau_news.csv',overwrite=True)"
]
},
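{
"cell_type": "markdown",
"metadata": {},
"source": [
"The recorded NameError above means this cell was re-run in a fresh kernel where the `project` handle from the first cell no longer existed. Below is a hedged sketch of restoring it before saving; the id and token values are placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hedged sketch: re-create the Project handle before saving (placeholder credentials).\n",
"from project_lib import Project\n",
"project = Project(project_id='<project-id>', project_access_token='<access-token>')\n",
"project.save_data(data=feature_news.to_csv(), file_name='itau_news.csv', overwrite=True)"
]
},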
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}