Skip to content

Instantly share code, notes, and snippets.

@eduardodx
Created March 22, 2021 16:11
Show Gist options
  • Save eduardodx/6b233950758f2b2ed8238c1b8d252829 to your computer and use it in GitHub Desktop.
Save eduardodx/6b233950758f2b2ed8238c1b8d252829 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "nlp-lista-02-eduardo-souza.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "41rLmp87qizR",
"outputId": "753adc94-59ae-43bc-bf2a-e824821ca7a4"
},
"source": [
"! pip install --upgrade pip spacy==3.0.5"
],
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting pip\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)\n",
"\u001b[K |████████████████████████████████| 1.5MB 5.9MB/s \n",
"\u001b[?25hCollecting spacy==3.0.5\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/3a/70/a0b8bd0cb54d8739ba4d6fb3458785c3b9b812b7fbe93b0f10beb1a53ada/spacy-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)\n",
"\u001b[K |████████████████████████████████| 12.8MB 13.6MB/s \n",
"\u001b[?25hCollecting catalogue<2.1.0,>=2.0.1\n",
" Downloading https://files.pythonhosted.org/packages/48/5c/493a2f3bb0eac17b1d48129ecfd251f0520b6c89493e9fd0522f534a9e4a/catalogue-2.0.1-py3-none-any.whl\n",
"Collecting srsly<3.0.0,>=2.4.0\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/19/54/76982427ceb495dd19ff982c966708c624b85e03c45bf1912feaf60c7b2d/srsly-2.4.0-cp37-cp37m-manylinux2014_x86_64.whl (456kB)\n",
"\u001b[K |████████████████████████████████| 460kB 39.9MB/s \n",
"\u001b[?25hRequirement already satisfied, skipping upgrade: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.11.3)\n",
"Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (4.41.1)\n",
"Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (1.0.5)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (1.19.5)\n",
"Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.0.5)\n",
"Requirement already satisfied, skipping upgrade: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (20.9)\n",
"Requirement already satisfied, skipping upgrade: typing-extensions<4.0.0.0,>=3.7.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.7.4.3)\n",
"Collecting thinc<8.1.0,>=8.0.2\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/08/20e707519bcded1a0caa6fd024b767ac79e4e5d0fb92266bb7dcf735e338/thinc-8.0.2-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)\n",
"\u001b[K |████████████████████████████████| 1.1MB 52.1MB/s \n",
"\u001b[?25hRequirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (54.1.2)\n",
"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.23.0)\n",
"Collecting pydantic<1.8.0,>=1.7.1\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/b3/0a/52ae1c659fc08f13dd7c0ae07b88e4f807ad83fb9954a59b0b0a3d1a8ab6/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)\n",
"\u001b[K |████████████████████████████████| 9.1MB 52.1MB/s \n",
"\u001b[?25hCollecting spacy-legacy<3.1.0,>=3.0.0\n",
" Downloading https://files.pythonhosted.org/packages/65/d5/6c58fc97f3098775e46d8202bf248752e626a8096a0ae9d76aa7c485a09c/spacy_legacy-3.0.1-py2.py3-none-any.whl\n",
"Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (0.8.2)\n",
"Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.0.5)\n",
"Collecting typer<0.4.0,>=0.3.0\n",
" Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl\n",
"Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.7.2)\n",
"Requirement already satisfied, skipping upgrade: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (0.4.1)\n",
"Collecting pathy>=0.3.5\n",
" Downloading https://files.pythonhosted.org/packages/a2/53/97dc0197cca9357369b3b71bf300896cf2d3604fa60ffaaf5cbc277de7de/pathy-0.4.0-py3-none-any.whl\n",
"Requirement already satisfied, skipping upgrade: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy==3.0.5) (1.1.1)\n",
"Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy==3.0.5) (2.4.7)\n",
"Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (1.24.3)\n",
"Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (2.10)\n",
"Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (3.0.4)\n",
"Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (2020.12.5)\n",
"Requirement already satisfied, skipping upgrade: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy==3.0.5) (7.1.2)\n",
"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->spacy==3.0.5) (3.4.1)\n",
"Collecting smart-open<4.0.0,>=2.2.0\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/11/9a/ba2d5f67f25e8d5bbf2fcec7a99b1e38428e83cb715f64dd179ca43a11bb/smart_open-3.0.0.tar.gz (113kB)\n",
"\u001b[K |████████████████████████████████| 122kB 42.9MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: smart-open\n",
" Building wheel for smart-open (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for smart-open: filename=smart_open-3.0.0-cp37-none-any.whl size=107098 sha256=810002469863a684a53abc41921783aa00acdd642809ed483f51579a22fbca1f\n",
" Stored in directory: /root/.cache/pip/wheels/18/88/7c/f06dabd5e9cabe02d2269167bcacbbf9b47d0c0ff7d6ebcb78\n",
"Successfully built smart-open\n",
"Installing collected packages: pip, catalogue, srsly, pydantic, thinc, spacy-legacy, typer, smart-open, pathy, spacy\n",
" Found existing installation: pip 19.3.1\n",
" Uninstalling pip-19.3.1:\n",
" Successfully uninstalled pip-19.3.1\n",
" Found existing installation: catalogue 1.0.0\n",
" Uninstalling catalogue-1.0.0:\n",
" Successfully uninstalled catalogue-1.0.0\n",
" Found existing installation: srsly 1.0.5\n",
" Uninstalling srsly-1.0.5:\n",
" Successfully uninstalled srsly-1.0.5\n",
" Found existing installation: thinc 7.4.0\n",
" Uninstalling thinc-7.4.0:\n",
" Successfully uninstalled thinc-7.4.0\n",
" Found existing installation: smart-open 4.2.0\n",
" Uninstalling smart-open-4.2.0:\n",
" Successfully uninstalled smart-open-4.2.0\n",
" Found existing installation: spacy 2.2.4\n",
" Uninstalling spacy-2.2.4:\n",
" Successfully uninstalled spacy-2.2.4\n",
"Successfully installed catalogue-2.0.1 pathy-0.4.0 pip-21.0.1 pydantic-1.7.3 smart-open-3.0.0 spacy-3.0.5 spacy-legacy-3.0.1 srsly-2.4.0 thinc-8.0.2 typer-0.3.2\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.colab-display-data+json": {
"pip_warning": {
"packages": [
"catalogue",
"spacy",
"srsly",
"thinc"
]
}
}
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Bm-k493ll3vp",
"outputId": "19cac8a1-eee3-44fa-cbea-ee741bfbdc16"
},
"source": [
"! python -m spacy download pt_core_news_lg"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"2021-03-22 16:02:06.454823: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n",
"Collecting pt-core-news-lg==3.0.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.0.0/pt_core_news_lg-3.0.0-py3-none-any.whl (578.1 MB)\n",
"\u001b[K |████████████████████████████████| 578.1 MB 7.9 kB/s \n",
"\u001b[?25hRequirement already satisfied: spacy<3.1.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from pt-core-news-lg==3.0.0) (3.0.5)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.5)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (4.41.1)\n",
"Requirement already satisfied: pydantic<1.8.0,>=1.7.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.7.3)\n",
"Requirement already satisfied: typing-extensions<4.0.0.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.7.4.3)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (54.1.2)\n",
"Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.4.0)\n",
"Requirement already satisfied: thinc<8.1.0,>=8.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (8.0.2)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.0.5)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.0.5)\n",
"Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.19.5)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.11.3)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.1)\n",
"Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.7.2)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.0.1)\n",
"Requirement already satisfied: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.4.1)\n",
"Requirement already satisfied: typer<0.4.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.3.2)\n",
"Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.8.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.23.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (20.9)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.4.0)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.4.1)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.4.7)\n",
"Requirement already satisfied: smart-open<4.0.0,>=2.2.0 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2020.12.5)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.24.3)\n",
"Requirement already satisfied: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (7.1.2)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.1.1)\n",
"Installing collected packages: pt-core-news-lg\n",
"Successfully installed pt-core-news-lg-3.0.0\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('pt_core_news_lg')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RvTOLOsbj2j6"
},
"source": [
"import pt_core_news_lg\n",
"import pandas as pd"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "tXDyhwa6kPmE"
},
"source": [
"nlp = pt_core_news_lg.load()"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bS7T3Eu7kSBn",
"outputId": "c1e02b59-3436-4c34-8bf2-aac9ba5e7ea0"
},
"source": [
"df = pd.read_csv(\"https://github.com/b2wdigital/b2w-reviews01/raw/master/B2W-Reviews01.csv\", delimiter=\";\")"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "EuBDTgnukdJt",
"outputId": "bfe390b6-3f03-40ac-c5f7-0545556609eb"
},
"source": [
"df.head()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>submission_date</th>\n",
" <th>reviewer_id</th>\n",
" <th>product_id</th>\n",
" <th>product_name</th>\n",
" <th>product_brand</th>\n",
" <th>site_category_lv1</th>\n",
" <th>site_category_lv2</th>\n",
" <th>review_title</th>\n",
" <th>overall_rating</th>\n",
" <th>recommend_to_a_friend</th>\n",
" <th>review_text</th>\n",
" <th>reviewer_birth_year</th>\n",
" <th>reviewer_gender</th>\n",
" <th>reviewer_state</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2018-01-01 00:11:28</td>\n",
" <td>d0fb1ca69422530334178f5c8624aa7a99da47907c44de...</td>\n",
" <td>132532965</td>\n",
" <td>Notebook Asus Vivobook Max X541NA-GO472T Intel...</td>\n",
" <td>NaN</td>\n",
" <td>Informática</td>\n",
" <td>Notebook</td>\n",
" <td>Bom</td>\n",
" <td>4</td>\n",
" <td>Yes</td>\n",
" <td>Estou contente com a compra entrega rápida o ú...</td>\n",
" <td>1958.0</td>\n",
" <td>F</td>\n",
" <td>RJ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2018-01-01 00:13:48</td>\n",
" <td>014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...</td>\n",
" <td>22562178</td>\n",
" <td>Copo Acrílico Com Canudo 500ml Rocie</td>\n",
" <td>NaN</td>\n",
" <td>Utilidades Domésticas</td>\n",
" <td>Copos, Taças e Canecas</td>\n",
" <td>Preço imbatível, ótima qualidade</td>\n",
" <td>4</td>\n",
" <td>Yes</td>\n",
" <td>Por apenas R$1994.20,eu consegui comprar esse ...</td>\n",
" <td>1996.0</td>\n",
" <td>M</td>\n",
" <td>SC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-01-01 00:26:02</td>\n",
" <td>44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...</td>\n",
" <td>113022329</td>\n",
" <td>Panela de Pressão Elétrica Philips Walita Dail...</td>\n",
" <td>philips walita</td>\n",
" <td>Eletroportáteis</td>\n",
" <td>Panela Elétrica</td>\n",
" <td>ATENDE TODAS AS EXPECTATIVA.</td>\n",
" <td>4</td>\n",
" <td>Yes</td>\n",
" <td>SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...</td>\n",
" <td>1984.0</td>\n",
" <td>M</td>\n",
" <td>SP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2018-01-01 00:35:54</td>\n",
" <td>ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...</td>\n",
" <td>113851581</td>\n",
" <td>Betoneira Columbus - Roma Brinquedos</td>\n",
" <td>roma jensen</td>\n",
" <td>Brinquedos</td>\n",
" <td>Veículos de Brinquedo</td>\n",
" <td>presente mais que desejado</td>\n",
" <td>4</td>\n",
" <td>Yes</td>\n",
" <td>MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...</td>\n",
" <td>1985.0</td>\n",
" <td>F</td>\n",
" <td>SP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-01-01 01:00:28</td>\n",
" <td>7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...</td>\n",
" <td>131788803</td>\n",
" <td>Smart TV LED 43\" LG 43UJ6525 Ultra HD 4K com C...</td>\n",
" <td>lg</td>\n",
" <td>TV e Home Theater</td>\n",
" <td>TV</td>\n",
" <td>Sem duvidas, excelente</td>\n",
" <td>5</td>\n",
" <td>Yes</td>\n",
" <td>A entrega foi no prazo, as americanas estão de...</td>\n",
" <td>1994.0</td>\n",
" <td>M</td>\n",
" <td>MG</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" submission_date ... reviewer_state\n",
"0 2018-01-01 00:11:28 ... RJ\n",
"1 2018-01-01 00:13:48 ... SC\n",
"2 2018-01-01 00:26:02 ... SP\n",
"3 2018-01-01 00:35:54 ... SP\n",
"4 2018-01-01 01:00:28 ... MG\n",
"\n",
"[5 rows x 14 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dF16h_hhlAIY"
},
"source": [
"## Text-processing"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-gc5_NRalBx7"
},
"source": [
"original_docs = df.review_text"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "b53ALLZ3lL-g"
},
"source": [
"def text_processing(text):\n",
" doc = nlp(text)\n",
" processed = []\n",
"\n",
" for token in doc:\n",
" lexeme = doc.vocab[token.text]\n",
"\n",
" if token.is_stop or token.is_punct or token.like_num or token.is_space:\n",
" continue\n",
"\n",
" processed.append(f\"{token.lemma_.lower()}-{token.pos_}-{token.is_oov}\")\n",
"\n",
" return \" \".join(processed)"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "smJDfkjtlSTu"
},
"source": [
"docs = list(original_docs.map(text_processing))"
],
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nOsuG7NclXeK",
"outputId": "a979f31c-c290-4264-be5e-2ab0f629a42c"
},
"source": [
"docs"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['contentar-ADJ-False comprar-NOUN-False entregar-NOUN-False rápido-ADJ-False único-ADJ-False problema-NOUN-False americanas-PROPN-False haver-VERB-False trocar-NOUN-False devolução-NOUN-False produto-NOUN-False consumidor-NOUN-False problema-NOUN-False esperar-NOUN-False',\n",
" 'r$-SYM-False 1994.20,eu-NUM-True conseguir-VERB-False comprar-VERB-False lindar-ADJ-False copar-NOUN-False acrílico-NOUN-False',\n",
" 'supera-NOUN-False agilidade-NOUN-False praticidade-NOUN-False panelas-PROPN-False elétricas-PROPN-False costumo-PROPN-True panela-PROPN-False cozimento-PROPN-True arroz-PROPN-False japonesa-PROPN-False leva-VERB-False +-NOUN-False minutos-NOUN-False panela-NOUN-False rápido-NOUN-False exatamente-PROPN-False minutos-NOUN-False recomendo-DET-False',\n",
" 'filho-PROPN-False amou-PROPN-False verdade-PROPN-False tantos-PROPN-False detalhes-NOUN-False',\n",
" 'entregar-NOUN-False prazo-NOUN-False americano-NOUN-False parabém-NOUN-False smart-ADJ-False tv-NOUN-False navegação-NOUN-False internete-NOUN-False aplicativo-NOUN-False excelente-ADJ-False travar-VERB-False falar-VERB-False imagem-NOUN-False surpreender-VERB-False recomendar-VERB-False',\n",
" 'excelente-ADJ-False produto-NOUN-False material-NOUN-False acrílico-ADJ-False super-ADJ-False resistente-ADJ-False adamantio-NOUN-True milagre-NOUN-False bebido-NOUN-False sugiro-VERB-False aproveitar-VERB-False promoção-NOUN-False acabar-VERB-False',\n",
" 'produto-NOUN-False mto-ADV-False garrafa-NOUN-False vc-PROPN-False servir-VERB-False água-NOUN-False pro-ADP-False megazord-NOUN-False to-SCONJ-False pensar-VERB-False vender-VERB-False tv-NOUN-False pra-SCONJ-False comprar-VERB-False garrafa-NOUN-False recomendo-NOUN-False',\n",
" 'produto-NOUN-False excelente-ADJ-False qualidade-NOUN-False câmera-NOUN-False desenvolvimento-NOUN-False android-PROPN-False rapidez-NOUN-False',\n",
" 'barulhar-NOUN-False minimo-ADJ-False ventar-NOUN-False forte-ADJ-False velocidade-NOUN-False',\n",
" 'produto-PROPN-False nao-PROPN-False entregue-PROPN-False americanas-PROPN-False descontando-VERB-True fatura-NOUN-False cartão-NOUN-False']"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "s3wPbWsElCws"
},
"source": [
"## Feature-extraction"
]
},
{
"cell_type": "code",
"metadata": {
"id": "qPsNuX96lD7A"
},
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fXgzeTVOrhlu"
},
"source": [
"vectorizer = TfidfVectorizer()"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pum_CErnrjV1"
},
"source": [
"X = vectorizer.fit_transform(docs)"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pn33rA0proKY",
"outputId": "0d4d28bb-d22d-49ef-8c72-c8348cd33310"
},
"source": [
"# cada linha é um documento e cada coluna corresponde a uma palavra\n",
"print(X)"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
" (0, 35)\t0.1347699687619662\n",
" (0, 21)\t0.1347699687619662\n",
" (0, 70)\t0.08002921702608817\n",
" (0, 31)\t0.1347699687619662\n",
" (0, 90)\t0.1347699687619662\n",
" (0, 97)\t0.06569483075589849\n",
" (0, 45)\t0.1347699687619662\n",
" (0, 72)\t0.07234832342842883\n",
" (0, 9)\t0.11456678602912203\n",
" (0, 69)\t0.2695399375239324\n",
" (0, 100)\t0.1347699687619662\n",
" (0, 78)\t0.11456678602912203\n",
" (0, 33)\t0.11456678602912203\n",
" (0, 59)\t0.44844461572533584\n",
" (0, 19)\t0.10023239975893235\n",
" (0, 40)\t0.6975805133505224\n",
" (0, 5)\t0.2170449702852865\n",
" (0, 22)\t0.1347699687619662\n",
" (1, 3)\t0.20979116088351724\n",
" (1, 23)\t0.24678660525242735\n",
" (1, 50)\t0.24678660525242735\n",
" (1, 20)\t0.24678660525242735\n",
" (1, 91)\t0.16318241313102977\n",
" (1, 60)\t0.24678660525242735\n",
" (1, 36)\t0.24678660525242735\n",
" :\t:\n",
" (7, 70)\t0.1457589675335856\n",
" (7, 72)\t0.13176958762792945\n",
" (7, 59)\t0.4537566724108096\n",
" (7, 40)\t0.6352593413751334\n",
" (7, 5)\t0.13176958762792945\n",
" (8, 94)\t0.3042860691298508\n",
" (8, 43)\t0.3042860691298508\n",
" (8, 96)\t0.3042860691298508\n",
" (8, 54)\t0.3042860691298508\n",
" (8, 16)\t0.3042860691298508\n",
" (8, 59)\t0.33750211711364186\n",
" (8, 40)\t0.5625035285227364\n",
" (8, 5)\t0.3266987021871967\n",
" (9, 18)\t0.24376622300595205\n",
" (9, 41)\t0.24376622300595205\n",
" (9, 27)\t0.24376622300595205\n",
" (9, 34)\t0.24376622300595205\n",
" (9, 57)\t0.24376622300595205\n",
" (9, 91)\t0.16118524937469939\n",
" (9, 70)\t0.1447534650618593\n",
" (9, 97)\t0.11882603306575845\n",
" (9, 72)\t0.5234423575213665\n",
" (9, 9)\t0.20722356003195608\n",
" (9, 59)\t0.1802505924344424\n",
" (9, 40)\t0.5407517773033272\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment