Skip to content

Instantly share code, notes, and snippets.

@miguel-castillo
Last active January 2, 2021 14:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miguel-castillo/8af16dee70603cb1d551359a2c3941ab to your computer and use it in GitHub Desktop.
Save miguel-castillo/8af16dee70603cb1d551359a2c3941ab to your computer and use it in GitHub Desktop.
covid_tweets_project.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "covid_tweets_project.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyN6F0otS2mpo1c3kqh8xaGO",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/miguel-castillo/8af16dee70603cb1d551359a2c3941ab/covid_tweets_project.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "aDGKhafecaV1",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "f2725fa7-43da-48de-eb35-d76462163dee"
},
"source": [
"!pip install wordninja\n",
"!pip install autocorrect\n",
"!pip install wordsegment\n",
"!apt-get update\n",
"!apt install openjdk-8-jdk\n",
"!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java\n",
"!pip install language-check\n",
"!pip install pycontractions\n",
"!python -m spacy link en_core_web_md en_md"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting wordninja\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/30/15/abe4af50f4be92b60c25e43c1c64d08453b51e46c32981d80b3aebec0260/wordninja-2.0.0.tar.gz (541kB)\n",
"\r\u001b[K |▋ | 10kB 13.8MB/s eta 0:00:01\r\u001b[K |█▏ | 20kB 1.8MB/s eta 0:00:01\r\u001b[K |█▉ | 30kB 2.6MB/s eta 0:00:01\r\u001b[K |██▍ | 40kB 3.4MB/s eta 0:00:01\r\u001b[K |███ | 51kB 2.2MB/s eta 0:00:01\r\u001b[K |███▋ | 61kB 2.5MB/s eta 0:00:01\r\u001b[K |████▎ | 71kB 2.9MB/s eta 0:00:01\r\u001b[K |████▉ | 81kB 3.3MB/s eta 0:00:01\r\u001b[K |█████▌ | 92kB 2.6MB/s eta 0:00:01\r\u001b[K |██████ | 102kB 2.8MB/s eta 0:00:01\r\u001b[K |██████▋ | 112kB 2.8MB/s eta 0:00:01\r\u001b[K |███████▎ | 122kB 2.8MB/s eta 0:00:01\r\u001b[K |███████▉ | 133kB 2.8MB/s eta 0:00:01\r\u001b[K |████████▌ | 143kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████ | 153kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████▊ | 163kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████▎ | 174kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████ | 184kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████▌ | 194kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████ | 204kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████▊ | 215kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████▎ | 225kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████ | 235kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████▌ | 245kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████▏ | 256kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 266kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 276kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████ | 286kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████▌ | 296kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 307kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 317kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 327kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████ | 337kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████▋ | 348kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████████▏ | 358kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 368kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████████▍ | 378kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 389kB 2.8MB/s eta 0:00:01\r\u001b[K 
|███████████████████████▋ | 399kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 409kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 419kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████████████▍ | 430kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 440kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████▋ | 450kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 460kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 471kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████████████▍ | 481kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 491kB 2.8MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 501kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▎ | 512kB 2.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▉ | 522kB 2.8MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 532kB 2.8MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 542kB 2.8MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: wordninja\n",
" Building wheel for wordninja (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for wordninja: filename=wordninja-2.0.0-cp36-none-any.whl size=541552 sha256=1fe44e2b07a6f4c9d17b00033a612dadc6b19cb2625d248b9d72d670a70c609e\n",
" Stored in directory: /root/.cache/pip/wheels/22/46/06/9b6d10ed02c85e93c3bb33ac50e2d368b2586248f192a2e22a\n",
"Successfully built wordninja\n",
"Installing collected packages: wordninja\n",
"Successfully installed wordninja-2.0.0\n",
"Collecting autocorrect\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/a9/b0/a1d628fa192e8ebf124b4cebc2a42b4e3aa65b8052fdf4888e04fadf3e8d/autocorrect-1.1.0.tar.gz (1.8MB)\n",
"\u001b[K |████████████████████████████████| 1.8MB 2.8MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: autocorrect\n",
" Building wheel for autocorrect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for autocorrect: filename=autocorrect-1.1.0-cp36-none-any.whl size=1810772 sha256=619ac3b958d6c51e7df402dc96d5dba21a7165a698eb8f1e43be8df2df100dc8\n",
" Stored in directory: /root/.cache/pip/wheels/78/7f/b1/527522820ae623df6a2dbe14f778d23adaea4bebe43f7ebcfe\n",
"Successfully built autocorrect\n",
"Installing collected packages: autocorrect\n",
"Successfully installed autocorrect-1.1.0\n",
"Collecting wordsegment\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/cf/6c/e6f4734d6f7d28305f52ec81377d7ce7d1856b97b814278e9960183235ad/wordsegment-1.3.1-py2.py3-none-any.whl (4.8MB)\n",
"\u001b[K |████████████████████████████████| 4.8MB 2.8MB/s \n",
"\u001b[?25hInstalling collected packages: wordsegment\n",
"Successfully installed wordsegment-1.3.1\n",
"Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n",
"Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n",
"Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]\n",
"Hit:4 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n",
"Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease\n",
"Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n",
"Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release\n",
"Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n",
"Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n",
"Get:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]\n",
"Get:11 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [844 kB]\n",
"Get:12 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [908 kB]\n",
"Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n",
"Get:16 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main Sources [1,814 kB]\n",
"Get:17 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [1,205 kB]\n",
"Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [1,376 kB]\n",
"Get:19 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main amd64 Packages [875 kB]\n",
"Fetched 7,294 kB in 3s (2,178 kB/s)\n",
"Reading package lists... Done\n",
"Reading package lists... Done\n",
"Building dependency tree \n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java\n",
" libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk-headless openjdk-8-jre\n",
" openjdk-8-jre-headless x11-utils\n",
"Suggested packages:\n",
" openjdk-8-demo openjdk-8-source visualvm icedtea-8-plugin libnss-mdns\n",
" fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei\n",
" fonts-wqy-zenhei fonts-indic mesa-utils\n",
"The following NEW packages will be installed:\n",
" fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java\n",
" libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk openjdk-8-jdk-headless\n",
" openjdk-8-jre openjdk-8-jre-headless x11-utils\n",
"0 upgraded, 10 newly installed, 0 to remove and 32 not upgraded.\n",
"Need to get 40.7 MB of archives.\n",
"After this operation, 153 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-core all 2.37-1 [1,041 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-extra all 2.37-1 [1,953 kB]\n",
"Get:4 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]\n",
"Get:5 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java all 0.33.3-20ubuntu0.1 [34.7 kB]\n",
"Get:6 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java-jni amd64 0.33.3-20ubuntu0.1 [28.3 kB]\n",
"Get:7 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre-headless amd64 8u252-b09-1~18.04 [27.5 MB]\n",
"Get:8 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre amd64 8u252-b09-1~18.04 [69.8 kB]\n",
"Get:9 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jdk-headless amd64 8u252-b09-1~18.04 [8,250 kB]\n",
"Get:10 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jdk amd64 8u252-b09-1~18.04 [1,622 kB]\n",
"Fetched 40.7 MB in 2s (19.9 MB/s)\n",
"Selecting previously unselected package libxxf86dga1:amd64.\n",
"(Reading database ... 144429 files and directories currently installed.)\n",
"Preparing to unpack .../0-libxxf86dga1_2%3a1.1.4-1_amd64.deb ...\n",
"Unpacking libxxf86dga1:amd64 (2:1.1.4-1) ...\n",
"Selecting previously unselected package fonts-dejavu-core.\n",
"Preparing to unpack .../1-fonts-dejavu-core_2.37-1_all.deb ...\n",
"Unpacking fonts-dejavu-core (2.37-1) ...\n",
"Selecting previously unselected package fonts-dejavu-extra.\n",
"Preparing to unpack .../2-fonts-dejavu-extra_2.37-1_all.deb ...\n",
"Unpacking fonts-dejavu-extra (2.37-1) ...\n",
"Selecting previously unselected package x11-utils.\n",
"Preparing to unpack .../3-x11-utils_7.7+3build1_amd64.deb ...\n",
"Unpacking x11-utils (7.7+3build1) ...\n",
"Selecting previously unselected package libatk-wrapper-java.\n",
"Preparing to unpack .../4-libatk-wrapper-java_0.33.3-20ubuntu0.1_all.deb ...\n",
"Unpacking libatk-wrapper-java (0.33.3-20ubuntu0.1) ...\n",
"Selecting previously unselected package libatk-wrapper-java-jni:amd64.\n",
"Preparing to unpack .../5-libatk-wrapper-java-jni_0.33.3-20ubuntu0.1_amd64.deb ...\n",
"Unpacking libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ...\n",
"Selecting previously unselected package openjdk-8-jre-headless:amd64.\n",
"Preparing to unpack .../6-openjdk-8-jre-headless_8u252-b09-1~18.04_amd64.deb ...\n",
"Unpacking openjdk-8-jre-headless:amd64 (8u252-b09-1~18.04) ...\n",
"Selecting previously unselected package openjdk-8-jre:amd64.\n",
"Preparing to unpack .../7-openjdk-8-jre_8u252-b09-1~18.04_amd64.deb ...\n",
"Unpacking openjdk-8-jre:amd64 (8u252-b09-1~18.04) ...\n",
"Selecting previously unselected package openjdk-8-jdk-headless:amd64.\n",
"Preparing to unpack .../8-openjdk-8-jdk-headless_8u252-b09-1~18.04_amd64.deb ...\n",
"Unpacking openjdk-8-jdk-headless:amd64 (8u252-b09-1~18.04) ...\n",
"Selecting previously unselected package openjdk-8-jdk:amd64.\n",
"Preparing to unpack .../9-openjdk-8-jdk_8u252-b09-1~18.04_amd64.deb ...\n",
"Unpacking openjdk-8-jdk:amd64 (8u252-b09-1~18.04) ...\n",
"Setting up fonts-dejavu-core (2.37-1) ...\n",
"Setting up libxxf86dga1:amd64 (2:1.1.4-1) ...\n",
"Setting up fonts-dejavu-extra (2.37-1) ...\n",
"Setting up openjdk-8-jre-headless:amd64 (8u252-b09-1~18.04) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/orbd to provide /usr/bin/orbd (orbd) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/servertool to provide /usr/bin/servertool (servertool) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/tnameserv to provide /usr/bin/tnameserv (tnameserv) in auto mode\n",
"Setting up openjdk-8-jdk-headless:amd64 (8u252-b09-1~18.04) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/idlj to provide /usr/bin/idlj (idlj) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/wsimport to provide /usr/bin/wsimport (wsimport) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jsadebugd to provide /usr/bin/jsadebugd (jsadebugd) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/native2ascii to provide /usr/bin/native2ascii (native2ascii) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/javah to provide /usr/bin/javah (javah) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/clhsdb to provide /usr/bin/clhsdb (clhsdb) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/extcheck to provide /usr/bin/extcheck (extcheck) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/hsdb to provide /usr/bin/hsdb (hsdb) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/schemagen to provide /usr/bin/schemagen (schemagen) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/xjc to provide /usr/bin/xjc (xjc) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jhat to provide /usr/bin/jhat (jhat) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/wsgen to provide /usr/bin/wsgen (wsgen) in auto mode\n",
"Setting up x11-utils (7.7+3build1) ...\n",
"Setting up libatk-wrapper-java (0.33.3-20ubuntu0.1) ...\n",
"Setting up libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ...\n",
"Setting up openjdk-8-jre:amd64 (8u252-b09-1~18.04) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/policytool to provide /usr/bin/policytool (policytool) in auto mode\n",
"Setting up openjdk-8-jdk:amd64 (8u252-b09-1~18.04) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/appletviewer to provide /usr/bin/appletviewer (appletviewer) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jconsole to provide /usr/bin/jconsole (jconsole) in auto mode\n",
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
"Processing triggers for hicolor-icon-theme (0.17-2) ...\n",
"Processing triggers for fontconfig (2.12.6-0ubuntu2) ...\n",
"Processing triggers for mime-support (3.60ubuntu1) ...\n",
"Processing triggers for libc-bin (2.27-3ubuntu1) ...\n",
"/sbin/ldconfig.real: /usr/local/lib/python3.6/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link\n",
"\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode\n",
"Collecting language-check\n",
" Downloading https://files.pythonhosted.org/packages/97/45/0fd1d3683d6129f30fa09143fa383cdf6dff8bc0d1648f2cf156109cb772/language-check-1.1.tar.gz\n",
"Building wheels for collected packages: language-check\n",
" Building wheel for language-check (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for language-check: filename=language_check-1.1-cp36-none-any.whl size=90190899 sha256=ebd0f3df7c321c9bff2c2aaed4bbc288d3b15ec88ff4fb1d2e55cd8a82cacf35\n",
" Stored in directory: /root/.cache/pip/wheels/d5/46/82/90a89c23eac1837364ed7217a9eed71bc9e6ad4825be93968e\n",
"Successfully built language-check\n",
"Installing collected packages: language-check\n",
"Successfully installed language-check-1.1\n",
"Collecting pycontractions\n",
" Downloading https://files.pythonhosted.org/packages/a6/f5/d3ec9491c530cbc03af32ca2c6b69b0e89660daeb2856b485d90f9d82e5e/pycontractions-2.0.1-py3-none-any.whl\n",
"Requirement already satisfied: pyemd>=0.4.4 in /usr/local/lib/python3.6/dist-packages (from pycontractions) (0.5.1)\n",
"Requirement already satisfied: gensim>=2.0 in /usr/local/lib/python3.6/dist-packages (from pycontractions) (3.6.0)\n",
"Requirement already satisfied: language-check>=1.0 in /usr/local/lib/python3.6/dist-packages (from pycontractions) (1.1)\n",
"Requirement already satisfied: numpy<2.0.0,>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from pyemd>=0.4.4->pycontractions) (1.18.4)\n",
"Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from gensim>=2.0->pycontractions) (1.12.0)\n",
"Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim>=2.0->pycontractions) (2.0.0)\n",
"Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim>=2.0->pycontractions) (1.4.1)\n",
"Requirement already satisfied: boto in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim>=2.0->pycontractions) (2.49.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim>=2.0->pycontractions) (2.23.0)\n",
"Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim>=2.0->pycontractions) (1.13.3)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim>=2.0->pycontractions) (2.9)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim>=2.0->pycontractions) (2020.4.5.1)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim>=2.0->pycontractions) (3.0.4)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim>=2.0->pycontractions) (1.24.3)\n",
"Requirement already satisfied: botocore<1.17.0,>=1.16.3 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim>=2.0->pycontractions) (1.16.3)\n",
"Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim>=2.0->pycontractions) (0.9.5)\n",
"Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim>=2.0->pycontractions) (0.3.3)\n",
"Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.3->boto3->smart-open>=1.2.1->gensim>=2.0->pycontractions) (0.15.2)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.3->boto3->smart-open>=1.2.1->gensim>=2.0->pycontractions) (2.8.1)\n",
"Installing collected packages: pycontractions\n",
"Successfully installed pycontractions-2.0.1\n",
"\n",
"\u001b[38;5;1m✘ Can't locate model data\u001b[0m\n",
"The data should be located in en_core_web_md\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "V3L4oA0xhCRb",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 131
},
"outputId": "c1f201ee-3657-4d53-a648-017a3e6f3c8d"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
"\n",
"Enter your authorization code:\n",
"··········\n",
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "mgHa446oO1rs",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 675
},
"outputId": "af9e517f-5f4d-4bab-ee47-74d042543e1d"
},
"source": [
"!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.0/en_core_web_md-2.2.0.tar.gz\n",
"!python -m spacy link en_core_web_md en_md"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.0/en_core_web_md-2.2.0.tar.gz\n",
"\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.0/en_core_web_md-2.2.0.tar.gz (96.4MB)\n",
"\u001b[K |████████████████████████████████| 96.4MB 48kB/s \n",
"\u001b[?25hRequirement already satisfied: spacy>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from en-core-web-md==2.2.0) (2.2.4)\n",
"Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (0.6.0)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (3.0.2)\n",
"Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (0.4.1)\n",
"Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (1.18.4)\n",
"Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (1.0.2)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (1.0.2)\n",
"Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (1.0.0)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (46.1.3)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (2.0.3)\n",
"Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (7.4.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (4.41.1)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (2.23.0)\n",
"Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.0->en-core-web-md==2.2.0) (1.1.3)\n",
"Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.0->en-core-web-md==2.2.0) (1.6.0)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-md==2.2.0) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-md==2.2.0) (2.9)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-md==2.2.0) (2020.4.5.1)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-md==2.2.0) (1.24.3)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.0->en-core-web-md==2.2.0) (3.1.0)\n",
"Building wheels for collected packages: en-core-web-md\n",
" Building wheel for en-core-web-md (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for en-core-web-md: filename=en_core_web_md-2.2.0-cp36-none-any.whl size=98072934 sha256=21382d52460e1f3f092f376a01009a3bb71a396445cbb300916b615c0ec30c45\n",
" Stored in directory: /root/.cache/pip/wheels/5f/3e/c9/36dd6e13b449fd84cd1f94b72dfbc559daf09f53dbf4e697a3\n",
"Successfully built en-core-web-md\n",
"Installing collected packages: en-core-web-md\n",
"Successfully installed en-core-web-md-2.2.0\n",
"\u001b[38;5;2m✔ Linking successful\u001b[0m\n",
"/usr/local/lib/python3.6/dist-packages/en_core_web_md -->\n",
"/usr/local/lib/python3.6/dist-packages/spacy/data/en_md\n",
"You can now load the model via spacy.load('en_md')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "nSsnhdBoqCzs",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 168
},
"outputId": "9a52cde6-4dc6-4915-f1e1-11d7ddf632e4"
},
"source": [
"import glob\n",
"import pandas as pd\n",
"import re\n",
"import spacy\n",
"from autocorrect import Speller\n",
"from wordsegment import load, segment\n",
"from pycontractions import Contractions\n",
"import gensim.downloader as api\n",
"import wordninja\n",
"from tqdm import tqdm\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"nlp = spacy.load('en_md')\n",
"\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"\n",
"load()\n",
"\n",
"spell = Speller()\n",
"\n",
"model = api.load(\"glove-twitter-100\")\n",
"\n",
"cont = Contractions(kv_model=model)\n",
"cont.load_models()\n",
"\n",
"path = '/content/drive/My Drive/annotated_tweets'"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n",
"[=================================================-] 98.9% 382.9/387.1MB downloaded\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KlzK9kbCzFok"
},
"source": [
"pd.options.display.max_colwidth = 400"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bWhU1N4jsAIX"
},
"source": [
"files = glob.glob(path + '/project_*_labeled_data.csv')\n",
"li = []\n",
"for file in files:\n",
" li.append(pd.read_csv(file))\n",
"tweets_df = pd.concat(li)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "mFdV3k04wCGZ"
},
"source": [
"X = tweets_df['Text']\n",
"y = tweets_df['Label']\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "V03wjD308D9D"
},
"source": [
"def clean_tweet(tweet):\n",
" tweet = re.sub(r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)', '',\n",
" tweet, flags=re.MULTILINE)\n",
" tweet = re.sub(r'[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)', '',\n",
" tweet, flags=re.MULTILINE)\n",
" tweet = ' '.join(re.sub(\"(@[A-Za-z0-9]+(?:_[a-zA-Z]+)*)|([^0-9A-Za-z \\' \\t])|(\\w+:\\/\\/\\S+)\", \n",
" \"\", tweet).split())\n",
" tweet = list(cont.expand_texts([tweet], precise=True))[0]\n",
" tweet = ' '.join(re.sub(\"('[A-Za-z0-9]+)\", \"\", tweet).split())\n",
" tweet = re.sub(r\"\\d\", \"\", tweet)\n",
" tweet = ' '.join(segment(tweet))\n",
" tweet = ' '.join([spell(w) for w in tweet.split()])\n",
" tweet = tweet.lower()\n",
" text_tokens = word_tokenize(tweet)\n",
" tweet = ' '.join([w for w in text_tokens if not w in stopwords.words()])\n",
"\n",
" tokenise = nlp(tweet)\n",
" tokens = [word.lemma_ if word.lemma_ != \"-PRON-\" else word.lower_ for word in tokenise]\n",
" tweet = ' '.join(tokens)\n",
"\n",
" return tweet"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gXQMdBdV-tWh",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"outputId": "eb808c2b-ffd1-4fa9-dba5-36695eaf741e"
},
"source": [
"import nltk\n",
"nltk.download('punkt')\n",
"from nltk.corpus import stopwords\n",
"nltk.download('stopwords')\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"def remove_stopwords(tweet):\n",
" text_tokens = word_tokenize(tweet)\n",
" tweet = ' '.join([w for w in text_tokens if not w in stopwords.words()])\n",
" return tweet"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "kCNmdEXM-PvK",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"outputId": "93755624-1cd0-4b38-810b-3af6d4b9cbb7"
},
"source": [
"clean_tweets = []\n",
"for tweet in tqdm(X):\n",
" clean_tweets.append(clean_tweet(tweet))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|██████████| 1996/1996 [12:41<00:00, 2.62it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1PA1_UA5Vfbx",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"outputId": "45fb8a6a-2c81-4f8d-cd4c-ee31d3ed58ec"
},
"source": [
"X = []\n",
"for tweet in tqdm(clean_tweets):\n",
" X.append(remove_stopwords(tweet))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|██████████| 1996/1996 [02:02<00:00, 16.28it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "NAXkf2vwgdrK",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 412
},
"outputId": "df329b46-7349-4c0a-d41f-b35242a9eab9"
},
"source": [
"for i in range (0,20):\n",
" print(X[i])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"rip free hug guy lockdown co stay home corona crisis coronavirus\n",
"upside lockdown fo mo quarantine coronavirus pandemic lockdown\n",
"get double dipping virus co quar life quarantine corona crisis coronavirus\n",
"remember home remember happy time co quarantine activity\n",
"play gaze co lockdown\n",
"pandemic get bad even video game send day worth peace shield virtual self quarantine co coronavirus pandemic\n",
"warning outbreak desperate celebrity advantage co lockdown may appear radio tv online demand resuscitation dead career please warn public spot corona crisis coronavirus\n",
"second full day quarantine already lose track day fully impression weekend past day covidcovidquaratine chill quar life\n",
"world health organization determine rt actor give co officially let quarantine essentially mean let know taha\n",
"normal daily life style call quarantine good quarantine year quarantine co corona stop krona\n",
"reach please remember rewind internet return stay home save life stay home stay safe video coronavirus lockdown homestay\n",
"give social gathering lend co quarantine life\n",
"cold case story uplifting hour day know coronavirus pandemic quarantine rough\n",
"due coronavirus pandemic quarantine recommend blow homie goodnight kiss foot away\n",
"know need hear fake eyelash get hand look freakish natural use quarantine let fallout permanently quarantine coronavirus\n",
"go need get pickle go survive quarantine co\n",
"anyone else already kitchen onslaught bore hungry teenager consider fridge freezer lockdown co stay home\n",
"quarantine times right would invite round swift ie party kane west party coronavirus taylor tell truth\n",
"weird force quarantine parent use spending much time commit murder co coronavirus\n",
"quarantine even get stress quar life stay home co lockdown\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "pVfjn71eldZu",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 746
},
"outputId": "afb82ffd-5bc9-4b1b-9b60-1b0ca027e3c3"
},
"source": [
"tweets_df.head(20)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Label</th>\n",
" <th>Text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>115</td>\n",
" <td>joke</td>\n",
" <td>RIP FREE HUGS GUY. #lockdown #Covid_19 #StayHome #CoronaCrisis #coronavirus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>760</td>\n",
" <td>joke</td>\n",
" <td>The upside of lockdown: NO FOMO. #Quarantine #CoronavirusPandemic #lockdown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>40</td>\n",
" <td>joke</td>\n",
" <td>This is what you get for double-dipping #virus #covid19 #QuaratineLife #Quarantined #CoronaCrisis #coronavirus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>781</td>\n",
" <td>joke</td>\n",
" <td>Just remember, while we're all home our dogs will remember this as the happy time. #COVID19 #QuarantineActivities</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>975</td>\n",
" <td>joke</td>\n",
" <td>@14wardi @FreshAz6 Just playin' with my ding a ling ... #COVID19 #lockdown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>572</td>\n",
" <td>joke</td>\n",
" <td>\"This pandemic has gotten so bad that even my video game sent me 14days worth of \"\"peace shields\"\" so my virtual self can be quarantined #Covid_19 #CoronavirusPandemic\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>13</td>\n",
" <td>joke</td>\n",
" <td>WARNING There is an outbreak of desperate celebrities taking advantage of the #Covid_19 lockdown. They may appear on radio, TV or online demanding resuscitation of their dead career. Please warn the public if you spot one. #CoronaCrisis #coronavirus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>985</td>\n",
" <td>joke</td>\n",
" <td>it’s my second full day of quarantine and i’ve already lost track of the days. i was fully under the impression it has been the weekend for the past 3 days. #COVID19 #Covid_19 #QuaratineAndChill #QuaratineLife</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>569</td>\n",
" <td>joke</td>\n",
" <td>The #WorldHealthOrganization determined dogs can't conrtact, or give, #Covid_19 They officially let dogs out of Quarantine. So does this essentially means: W H O let the dogs out? We now know. #bahamen</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>681</td>\n",
" <td>joke</td>\n",
" <td>So normal daily life style is called QUARANTINE , Good! 🌼🤗 QUARANTINE YEAR 25🌼🤗 #Quarantine #വീട്ടിലിരിമൈരേ #Covid_19 #CoronaStopKaroNa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>32</td>\n",
" <td>joke</td>\n",
" <td>You have reached the end of @netflix - please remember to rewind the Internet before you return it. #StayHomeSaveLives #StayHomeStaySafe #COVID19 #Covid_19 #coronavirus #Lockdown #Homestay https://t.co/8aN3ycksCF</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>191</td>\n",
" <td>joke</td>\n",
" <td>I am giving up social gatherings for lent .... #COVID19 #QuarantineLife</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>820</td>\n",
" <td>joke</td>\n",
" <td>When an @ABC2020 #coldcase story is an uplifting hour of your day, you know the #CoronavirusPandemic quarantine has been rough.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>230</td>\n",
" <td>joke</td>\n",
" <td>Due to #CoronavirusPandemic and #Quarantine it's now recommended that you blow your homies goodnight kisses from 6 feet away</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>524</td>\n",
" <td>joke</td>\n",
" <td>I don’t know who needs to hear this but your fake eyelashes have gotten out of hand and look freakish, not natural. Use this quarantine to let them fall out....permanently. #Quarantine #Coronavirus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>348</td>\n",
" <td>joke</td>\n",
" <td>I’m going to need to get some pickles if I am going to survive this quarantine #Covid_19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>656</td>\n",
" <td>joke</td>\n",
" <td>Anyone else already facing a kitchen onslaught from bored, hungry teenagers and considering a fridge-freezer lockdown? #Covid_19 #StayAtHome</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>549</td>\n",
" <td>joke</td>\n",
" <td>If we weren’t in quarantine times right now, I’d be inviting y’all round for a swiftie party #KanyeWestIsOverParty #coronavirus #TaylorToldTheTruth</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>551</td>\n",
" <td>joke</td>\n",
" <td>Isn’t it weird when you’re forced to quarantine with your parents and you aren’t used to spending that much time with him and now you want to commit murder. #COVID19 #coronavirus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>308</td>\n",
" <td>joke</td>\n",
" <td>Quarantining even got my dog stressed. #QuaratineLife #StayAtHome #Covid_19 #lockdown https://t.co/dYRaEWBHLk</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID ... Text\n",
"0 115 ... RIP FREE HUGS GUY. #lockdown #Covid_19 #StayHome #CoronaCrisis #coronavirus\n",
"1 760 ... The upside of lockdown: NO FOMO. #Quarantine #CoronavirusPandemic #lockdown\n",
"2 40 ... This is what you get for double-dipping #virus #covid19 #QuaratineLife #Quarantined #CoronaCrisis #coronavirus\n",
"3 781 ... Just remember, while we're all home our dogs will remember this as the happy time. #COVID19 #QuarantineActivities\n",
"4 975 ... @14wardi @FreshAz6 Just playin' with my ding a ling ... #COVID19 #lockdown\n",
"5 572 ... \"This pandemic has gotten so bad that even my video game sent me 14days worth of \"\"peace shields\"\" so my virtual self can be quarantined #Covid_19 #CoronavirusPandemic\"\n",
"6 13 ... WARNING There is an outbreak of desperate celebrities taking advantage of the #Covid_19 lockdown. They may appear on radio, TV or online demanding resuscitation of their dead career. Please warn the public if you spot one. #CoronaCrisis #coronavirus\n",
"7 985 ... it’s my second full day of quarantine and i’ve already lost track of the days. i was fully under the impression it has been the weekend for the past 3 days. #COVID19 #Covid_19 #QuaratineAndChill #QuaratineLife\n",
"8 569 ... The #WorldHealthOrganization determined dogs can't conrtact, or give, #Covid_19 They officially let dogs out of Quarantine. So does this essentially means: W H O let the dogs out? We now know. #bahamen\n",
"9 681 ... So normal daily life style is called QUARANTINE , Good! 🌼🤗 QUARANTINE YEAR 25🌼🤗 #Quarantine #വീട്ടിലിരിമൈരേ #Covid_19 #CoronaStopKaroNa\n",
"10 32 ... You have reached the end of @netflix - please remember to rewind the Internet before you return it. #StayHomeSaveLives #StayHomeStaySafe #COVID19 #Covid_19 #coronavirus #Lockdown #Homestay https://t.co/8aN3ycksCF\n",
"11 191 ... I am giving up social gatherings for lent .... #COVID19 #QuarantineLife\n",
"12 820 ... When an @ABC2020 #coldcase story is an uplifting hour of your day, you know the #CoronavirusPandemic quarantine has been rough.\n",
"13 230 ... Due to #CoronavirusPandemic and #Quarantine it's now recommended that you blow your homies goodnight kisses from 6 feet away\n",
"14 524 ... I don’t know who needs to hear this but your fake eyelashes have gotten out of hand and look freakish, not natural. Use this quarantine to let them fall out....permanently. #Quarantine #Coronavirus\n",
"15 348 ... I’m going to need to get some pickles if I am going to survive this quarantine #Covid_19\n",
"16 656 ... Anyone else already facing a kitchen onslaught from bored, hungry teenagers and considering a fridge-freezer lockdown? #Covid_19 #StayAtHome\n",
"17 549 ... If we weren’t in quarantine times right now, I’d be inviting y’all round for a swiftie party #KanyeWestIsOverParty #coronavirus #TaylorToldTheTruth\n",
"18 551 ... Isn’t it weird when you’re forced to quarantine with your parents and you aren’t used to spending that much time with him and now you want to commit murder. #COVID19 #coronavirus\n",
"19 308 ... Quarantining even got my dog stressed. #QuaratineLife #StayAtHome #Covid_19 #lockdown https://t.co/dYRaEWBHLk\n",
"\n",
"[20 rows x 3 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "l8pjjXSkKM05"
},
"source": [
"# Wrap the processed tweets in a DataFrame with a single 'text' column.\n",
"tweet_data_df = pd.DataFrame({'text': X})"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bELCdhyUK6Op",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"outputId": "060a2b22-b8f1-4d72-e037-82684a52916b"
},
"source": [
"# Underlying NumPy array of the 'text' column.\n",
"tweet_data_df['text'].values"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['rip free hug guy lockdown co stay home corona crisis coronavirus',\n",
" 'upside lockdown fo mo quarantine coronavirus pandemic lockdown',\n",
" 'get double dipping virus co quar life quarantine corona crisis coronavirus',\n",
" ...,\n",
" 'co muslim youth run nationwide campaign help neighbour self isolate due lockdown amp ongoing coronavirus australia pandemic please see attached picture detail let us get together',\n",
" 'know virus corona stop krona lockdown italy socially responsible pakistani stay safe stay home coronavirus pakistan coronavirus pandemic coronavirus corn virus corona outbreak corona pandemic',\n",
" 'request listen carefully coronavirus india corona corona chains coronavirus outbreak india coronavirus pakistan corona india lockdown pakistan kawika corona row virus santa curfew march co india co corona fighter'],\n",
" dtype=object)"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "dUAU3mPfIjoO",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"outputId": "c86eed74-223a-45e4-81cd-67146d1adad1"
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"from sklearn.cluster import KMeans\n",
"\n",
"# Seed shared by KMeans here and by the PCA cell further down.\n",
"random_state = 0\n",
"\n",
"# Fit the TF-IDF vectoriser and vectorise the processed tweets in one call.\n",
"vec = TfidfVectorizer()\n",
"features = vec.fit_transform(tweet_data_df.text.values)\n",
"\n",
"# Cluster the TF-IDF vectors into 7 groups.\n",
"cls = KMeans(n_clusters=7, random_state=random_state).fit(features)\n",
"\n",
"# Cell output: the cluster index predicted for each tweet.\n",
"cls.predict(features)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([2, 0, 2, ..., 0, 2, 3], dtype=int32)"
]
},
"metadata": {
"tags": []
},
"execution_count": 23
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "lswoQChzNQVO",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"outputId": "41ed9355-9f2d-4fd8-fe14-80f015d22dee"
},
"source": [
"cls.labels_"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([2, 0, 2, ..., 0, 2, 3], dtype=int32)"
]
},
"metadata": {
"tags": []
},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "27IH452-Nc-W"
},
"source": [
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Project the TF-IDF vectors onto two principal components for plotting;\n",
"# toarray() densifies the feature matrix before it is handed to PCA.\n",
"pca = PCA(n_components=2, random_state=random_state)\n",
"reduced_features = pca.fit_transform(features.toarray())\n",
"\n",
"# Map the centroids with the projection fitted above. Calling fit_transform here\n",
"# would refit PCA on the centroids alone and place them on different axes\n",
"# from the tweets they are plotted with.\n",
"reduced_cluster_centers = pca.transform(cls.cluster_centers_)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "SME-xP50OZWa",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 284
},
"outputId": "465eb7d9-c790-4ebf-8946-593420d5b85e"
},
"source": [
"# Tweets in the 2-D PCA projection, coloured by their predicted KMeans cluster.\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cls.predict(features))\n",
"# reduced_cluster_centers drawn on top as large blue crosses.\n",
"ax.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:, 1], marker='x', s=150, c='b')\n",
"# Title and axis labels let the figure stand alone; the trailing ';' hides the repr.\n",
"ax.set(title='KMeans clusters of TF-IDF tweet vectors (PCA projection)', xlabel='PC 1', ylabel='PC 2');"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7fec78974780>"
]
},
"metadata": {
"tags": []
},
"execution_count": 27
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "X0fir1IIOaOL"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment