{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Twitter NLP Classifiers.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/invegat/bb437d222f9d8eed2b07fa4f10247bfc/twitter-nlp-classifiers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
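{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook builds binary sentiment classifiers for tweets: it cleans and tokenizes the raw text, vectorizes it with TF-IDF and bag-of-words counts, and compares XGBoost, random forest, and logistic regression on a 10% sample of ~100,000 labeled tweets."
]
},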
{
"cell_type": "code",
"metadata": {
"id": "p84xDZIlsQ9J",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 66
},
"outputId": "aae3e04b-1a0b-4c5a-8998-138116d29c29"
},
"source": [
"import nltk\n",
"nltk.download('stopwords')"
],
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 29
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Uf6JRflWq1cz",
"colab_type": "code",
"colab": {}
},
"source": [
"import re\n",
"import io\n",
"import pandas as pd\n",
"import requests\n",
"import gensim\n",
"from gensim.models.word2vec import Word2Vec\n",
"from nltk.tokenize import word_tokenize\n",
"import string\n",
"from nltk.corpus import stopwords\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from xgboost import XGBClassifier\n",
"from sklearn.metrics import accuracy_score, roc_auc_score\n",
"from sklearn.linear_model import LogisticRegression"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:36.169063Z",
"start_time": "2019-03-29T17:50:36.165101Z"
},
"id": "Fg-goHQyqOuj",
"colab_type": "code",
"colab": {}
},
"source": [
"url = \"https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv\""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:39.124431Z",
"start_time": "2019-03-29T17:50:36.698578Z"
},
"id": "TZ_jEGwNqOuo",
"colab_type": "code",
"colab": {}
},
"source": [
"text = requests.get(url).text"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:42.867912Z",
"start_time": "2019-03-29T17:50:39.129493Z"
},
"id": "bvcSi4PuqOus",
"colab_type": "code",
"colab": {}
},
"source": [
"df = pd.read_csv(url)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:45.513536Z",
"start_time": "2019-03-29T17:50:42.869797Z"
},
"id": "gPNbgehNqOux",
"colab_type": "code",
"outputId": "03a945d3-d2cb-4b02-8808-2adfad990467",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 214
}
},
"source": [
"stop_words = set(stopwords.words('english'))\n",
"\n",
"# turn a doc into clean tokens\n",
"def clean_doc(doc):\n",
"\t# split into tokens by white space\n",
"\ttokens = doc.split()\n",
"\t# remove punctuation from each token\n",
"\ttable = str.maketrans('', '', string.punctuation)\n",
"\ttokens = [w.translate(table) for w in tokens]\n",
"\t# remove remaining tokens that are not alphabetic\n",
"\ttokens = [word for word in tokens if word.isalpha()]\n",
"\t# filter out stop words\n",
"# \tstop_words = set(stopwords.words('english'))\n",
"\ttokens = [w for w in tokens if not w in stop_words]\n",
"\t# filter out short tokens\n",
"\ttokens = [word.lower() for word in tokens if len(word) > 1]\n",
"\treturn tokens\n",
"\n",
"def clean_sentence(doc):\n",
"\t# split into tokens by white space\n",
"\ttokens = doc.split()\n",
"\t# remove punctuation from each token\n",
"\ttable = str.maketrans('', '', string.punctuation)\n",
"\ttokens = [w.translate(table) for w in tokens]\n",
"\t# remove remaining tokens that are not alphabetic\n",
"\ttokens = [word for word in tokens if word.isalpha()]\n",
"\t# filter out stop words\n",
"# \tstop_words = set(stopwords.words('english'))\n",
"\ttokens = [w for w in tokens if not w in stop_words]\n",
"\t# filter out short tokens\n",
"\ttokens = [word.lower() for word in tokens if len(word) > 1]\n",
"\treturn \" \".join(tokens)\n",
"\n",
"df['cleaned'] = df.SentimentText.apply(clean_doc)\n",
"df['cleanedSentence'] = df.SentimentText.apply(clean_sentence)\n",
"print(df.shape)\n",
"df.head()"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"(99989, 4)\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentiment</th>\n",
" <th>SentimentText</th>\n",
" <th>cleaned</th>\n",
" <th>cleanedSentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>is so sad for my APL frie...</td>\n",
" <td>[sad, apl, friend]</td>\n",
" <td>sad apl friend</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>I missed the New Moon trail...</td>\n",
" <td>[missed, new, moon, trailer]</td>\n",
" <td>missed new moon trailer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>omg its already 7:30 :O</td>\n",
" <td>[omg, already]</td>\n",
" <td>omg already</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>.. Omgaga. Im sooo im gunna CRy. I'...</td>\n",
" <td>[omgaga, im, sooo, im, gunna, cry, ive, dentis...</td>\n",
" <td>omgaga im sooo im gunna cry ive dentist since ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>i think mi bf is cheating on me!!! ...</td>\n",
" <td>[think, mi, bf, cheating, tt]</td>\n",
" <td>think mi bf cheating tt</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentiment ... cleanedSentence\n",
"0 0 ... sad apl friend\n",
"1 0 ... missed new moon trailer\n",
"2 1 ... omg already\n",
"3 0 ... omgaga im sooo im gunna cry ive dentist since ...\n",
"4 0 ... think mi bf cheating tt\n",
"\n",
"[5 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 34
}
]
},
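{
"cell_type": "markdown",
"metadata": {},
"source": [
"One subtlety in the cleaning above: tokens are checked against the lowercase stop-word set *before* they are lowercased, so capitalized stop words such as \"The\" slip through (one-letter ones like \"I\" are still dropped by the length filter). A minimal variant that lowercases first, if stricter filtering is wanted:\n",
"\n",
"```python\n",
"def clean_sentence_lowercase_first(doc):\n",
"    # lowercase before the stop-word check so capitalized stop words are caught too\n",
"    table = str.maketrans('', '', string.punctuation)\n",
"    tokens = [w.translate(table).lower() for w in doc.split()]\n",
"    tokens = [w for w in tokens if w.isalpha() and w not in stop_words and len(w) > 1]\n",
"    return \" \".join(tokens)\n",
"```"
]
},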
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:46.634914Z",
"start_time": "2019-03-29T17:50:45.516386Z"
},
"id": "e1t8UfPHqOu2",
"colab_type": "code",
"outputId": "f5cdd776-284c-431c-c5d0-ed083abead75",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 133
}
},
"source": [
"vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,1), stop_words='english')\n",
"vectorizer.fit(df.cleanedSentence)"
],
"execution_count": 35,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.float64'>, encoding='utf-8',\n",
" input='content', lowercase=True, max_df=1.0, max_features=10000,\n",
" min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n",
" smooth_idf=True, stop_words='english', strip_accents=None,\n",
" sublinear_tf=False, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, use_idf=True, vocabulary=None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 35
}
]
},
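{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to sanity-check the fitted vocabulary and IDF weights (a minimal sketch against the `vectorizer` fit above, using the same `get_feature_names()` API as the rest of this notebook):\n",
"\n",
"```python\n",
"terms = vectorizer.get_feature_names()\n",
"print(terms[:5], terms[-5:])   # alphabetically first and last terms kept\n",
"print(vectorizer.idf_[:5])     # higher IDF means a rarer, more distinctive term\n",
"```\n",
"\n",
"Note that this vectorizer is fit on the entire corpus; the `CountVectorizer` used for modeling below is fit on training text only, which avoids leaking test vocabulary."
]
},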
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:54.165799Z",
"start_time": "2019-03-29T17:50:54.162824Z"
},
"id": "urSxxHj2qOu9",
"colab_type": "code",
"colab": {}
},
"source": [
"X_train = df.cleanedSentence"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:50:57.723419Z",
"start_time": "2019-03-29T17:50:55.005352Z"
},
"id": "PCvseHfyqOvA",
"colab_type": "code",
"outputId": "2f8fc629-e5e8-4ce1-8a51-57fc22e033d2",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 263
}
},
"source": [
"train_word_counts = vectorizer.transform(X_train)\n",
"X_train_vectorized = pd.DataFrame(train_word_counts[0:10000].toarray(), columns=vectorizer.get_feature_names())\n",
"X_train_vectorized = X_train_vectorized.fillna(0)\n",
"print(X_train_vectorized.shape)\n",
"X_train_vectorized.head()"
],
"execution_count": 37,
"outputs": [
{
"output_type": "stream",
"text": [
"(10000, 10000)\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>aa</th>\n",
" <th>aaa</th>\n",
" <th>aaah</th>\n",
" <th>aaahh</th>\n",
" <th>aafreen</th>\n",
" <th>aah</th>\n",
" <th>aahhh</th>\n",
" <th>aalaap</th>\n",
" <th>aamyhaanson</th>\n",
" <th>aaron</th>\n",
" <th>aaronfuller</th>\n",
" <th>aaronob</th>\n",
" <th>aaronrenfree</th>\n",
" <th>aaronrgillespie</th>\n",
" <th>aaw</th>\n",
" <th>aaww</th>\n",
" <th>aawww</th>\n",
" <th>ab</th>\n",
" <th>abandoned</th>\n",
" <th>abbaks</th>\n",
" <th>abbey</th>\n",
" <th>abbiefletcher</th>\n",
" <th>abbsound</th>\n",
" <th>abby</th>\n",
" <th>abbybradz</th>\n",
" <th>abbyharenberg</th>\n",
" <th>abbyyyy</th>\n",
" <th>abc</th>\n",
" <th>abcdefglynis</th>\n",
" <th>abcmsaj</th>\n",
" <th>abduzeedo</th>\n",
" <th>abeeliever</th>\n",
" <th>aber</th>\n",
" <th>abiban</th>\n",
" <th>abideedles</th>\n",
" <th>abiface</th>\n",
" <th>abigaelettuce</th>\n",
" <th>abigaill</th>\n",
" <th>ability</th>\n",
" <th>abirtmo</th>\n",
" <th>...</th>\n",
" <th>youu</th>\n",
" <th>youuu</th>\n",
" <th>youuuu</th>\n",
" <th>youuuuu</th>\n",
" <th>youve</th>\n",
" <th>youyou</th>\n",
" <th>yr</th>\n",
" <th>yrs</th>\n",
" <th>yt</th>\n",
" <th>yu</th>\n",
" <th>yuck</th>\n",
" <th>yucky</th>\n",
" <th>yuh</th>\n",
" <th>yuk</th>\n",
" <th>yum</th>\n",
" <th>yumm</th>\n",
" <th>yummm</th>\n",
" <th>yummmy</th>\n",
" <th>yummy</th>\n",
" <th>yun</th>\n",
" <th>yung</th>\n",
" <th>yup</th>\n",
" <th>yupp</th>\n",
" <th>yur</th>\n",
" <th>yw</th>\n",
" <th>zac</th>\n",
" <th>zach</th>\n",
" <th>zack</th>\n",
" <th>ze</th>\n",
" <th>zealand</th>\n",
" <th>zero</th>\n",
" <th>zip</th>\n",
" <th>zombie</th>\n",
" <th>zombies</th>\n",
" <th>zomg</th>\n",
" <th>zone</th>\n",
" <th>zones</th>\n",
" <th>zoo</th>\n",
" <th>zoom</th>\n",
" <th>zune</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 10000 columns</p>\n",
"</div>"
],
"text/plain": [
" aa aaa aaah aaahh aafreen aah ... zomg zone zones zoo zoom zune\n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0\n",
"\n",
"[5 rows x 10000 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 37
}
]
},
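{
"cell_type": "markdown",
"metadata": {},
"source": [
"Densifying TF-IDF rows is the expensive step here: 10,000 rows × 10,000 float64 columns is roughly 800 MB. scikit-learn estimators accept the sparse matrix directly, so a sketch that skips the dense DataFrame entirely:\n",
"\n",
"```python\n",
"X_sparse = vectorizer.transform(X_train)   # scipy CSR matrix, stays sparse\n",
"print(type(X_sparse), X_sparse.shape)\n",
"# fraction of non-zero entries; tweets are short, so this is tiny\n",
"print(X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]))\n",
"```"
]
},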
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:51:09.809785Z",
"start_time": "2019-03-29T17:51:09.761256Z"
},
"colab_type": "code",
"id": "TX8OEgUP_3ee",
"colab": {}
},
"source": [
"dfs = df.sample(frac=0.1)\n",
"X = dfs.cleanedSentence\n",
"y = dfs.Sentiment.values\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"execution_count": 0,
"outputs": []
},
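{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two caveats on the split above: `df.sample` has no `random_state`, so the 10% sample changes on every run, and the 0/1 balance can drift between train and test. A variant that fixes both, for reproducible comparisons:\n",
"\n",
"```python\n",
"dfs = df.sample(frac=0.1, random_state=42)\n",
"X, y = dfs.cleanedSentence, dfs.Sentiment.values\n",
"# stratify=y keeps the class ratio identical in both splits\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y)\n",
"```"
]
},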
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:51:11.508193Z",
"start_time": "2019-03-29T17:51:11.504530Z"
},
"id": "h4YsonNhqOvL",
"colab_type": "code",
"outputId": "59a68def-cac0-4e16-9c30-403e7f65db1b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83
}
},
"source": [
"print(X_train.shape)\n",
"print(X_test.shape)\n",
"print(y_train.shape)\n",
"print(y_test.shape)"
],
"execution_count": 39,
"outputs": [
{
"output_type": "stream",
"text": [
"(7999,)\n",
"(2000,)\n",
"(7999,)\n",
"(2000,)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:51:16.482859Z",
"start_time": "2019-03-29T17:51:16.361378Z"
},
"id": "AlvWW5wRqOvP",
"colab_type": "code",
"outputId": "a1b2d96c-4f69-43f0-8f36-a4d8f8dcf099",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 116
}
},
"source": [
"vectorizer = CountVectorizer(max_features=1000, ngram_range=(1,1), stop_words='english')\n",
"vectorizer.fit(X_train)"
],
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=1000, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words='english',\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 40
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:51:18.167707Z",
"start_time": "2019-03-29T17:51:18.059599Z"
},
"id": "MkQUL4jzqOvW",
"colab_type": "code",
"outputId": "f1146396-5ef0-4133-c5f8-84df0bfd6aad",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 50
}
},
"source": [
"train_word_counts = vectorizer.transform(X_train)\n",
"X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())\n",
"\n",
"test_word_counts = vectorizer.transform(X_test)\n",
"X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())\n",
"print(X_train_vectorized.shape)\n",
"print(X_test_vectorized.shape)"
],
"execution_count": 41,
"outputs": [
{
"output_type": "stream",
"text": [
"(7999, 1000)\n",
"(2000, 1000)\n"
],
"name": "stdout"
}
]
},
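{
"cell_type": "markdown",
"metadata": {},
"source": [
"The fit-on-train / transform-both pattern above can be bundled so the vectorizer can never accidentally see test text. A minimal sketch with scikit-learn's `Pipeline` (not run here; `text_clf` is an illustrative name):\n",
"\n",
"```python\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"text_clf = Pipeline([\n",
"    ('vect', CountVectorizer(max_features=1000, stop_words='english')),\n",
"    ('clf', LogisticRegression(random_state=42, solver='newton-cg')),\n",
"])\n",
"# fit() learns the vocabulary and trains the model in one call;\n",
"# predict()/score() reuse the same fitted vocabulary on new text\n",
"text_clf.fit(X_train, y_train)\n",
"print(text_clf.score(X_test, y_test))\n",
"```"
]
},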
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:51:21.222904Z",
"start_time": "2019-03-29T17:51:21.218619Z"
},
"id": "jIUocZm5qOva",
"colab_type": "code",
"outputId": "d9377399-8851-4a32-c45d-e691a01ded47",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 33
}
},
"source": [
"df.Sentiment.unique()"
],
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([0, 1])"
]
},
"metadata": {
"tags": []
},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:48:51.962715Z",
"start_time": "2019-03-29T17:48:31.500744Z"
},
"id": "acEs8nlnqOve",
"colab_type": "code",
"outputId": "c05aa698-dfcf-4a65-a877-8b30f244d7a1",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83
}
},
"source": [
"XGB = XGBClassifier(n_estimators=200, objective=\"binary:logistic\").fit(X_train_vectorized, y_train)\n",
"train_predictions = XGB.predict(X_train_vectorized)\n",
"test_predictions = XGB.predict(X_test_vectorized)\n",
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
],
"execution_count": 43,
"outputs": [
{
"output_type": "stream",
"text": [
"Train Accuracy: 0.700962620327541\n",
"Test Accuracy: 0.6615\n",
"Train Roc Auc: 0.6712842399415706\n",
"Test Roc Auc: 0.6423723426511531\n"
],
"name": "stdout"
}
]
},
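{
"cell_type": "markdown",
"metadata": {},
"source": [
"`roc_auc_score` is computed above on hard 0/1 predictions, which reduces the ROC curve to a single operating point. Scoring on predicted probabilities is usually more informative (a sketch against the fitted `XGB`; the same applies to the random forest and logistic regression below):\n",
"\n",
"```python\n",
"# column 1 of predict_proba is the probability of the positive class\n",
"test_probs = XGB.predict_proba(X_test_vectorized)[:, 1]\n",
"print(f'Test Roc Auc (probabilities): {roc_auc_score(y_test, test_probs)}')\n",
"```"
]
},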
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:49:33.866072Z",
"start_time": "2019-03-29T17:48:53.730994Z"
},
"id": "5JIC_D9hqOvi",
"colab_type": "code",
"outputId": "9efa2958-961a-439e-e5b5-48e76e0a3d2a",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83
}
},
"source": [
"RFC = RandomForestClassifier(n_estimators=200).fit(X_train_vectorized, y_train)\n",
"\n",
"train_predictions = RFC.predict(X_train_vectorized)\n",
"test_predictions = RFC.predict(X_test_vectorized)\n",
"\n",
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
],
"execution_count": 44,
"outputs": [
{
"output_type": "stream",
"text": [
"Train Accuracy: 0.9577447180897612\n",
"Test Accuracy: 0.6755\n",
"Train Roc Auc: 0.9540209266139531\n",
"Test Roc Auc: 0.670136709634851\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-29T17:49:38.271311Z",
"start_time": "2019-03-29T17:49:36.806563Z"
},
"id": "KWeYS1RuqOvo",
"colab_type": "code",
"outputId": "c08a65cf-487c-4c26-8f0d-168b6d223d58",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83
}
},
"source": [
"LR = LogisticRegression(random_state=42, solver=\"newton-cg\").fit(X_train_vectorized, y_train)\n",
"\n",
"train_predictions = LR.predict(X_train_vectorized)\n",
"test_predictions = LR.predict(X_test_vectorized)\n",
"print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')\n",
"print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')\n",
"print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')\n",
"print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')"
],
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"text": [
"Train Accuracy: 0.7727215901987748\n",
"Test Accuracy: 0.687\n",
"Train Roc Auc: 0.7609365215458385\n",
"Test Roc Auc: 0.6786840792417\n"
],
"name": "stdout"
}
]
},
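{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary of the recorded runs above (test set of 2,000 tweets):\n",
"\n",
"| Model | Test accuracy | Test ROC AUC |\n",
"| --- | --- | --- |\n",
"| XGBoost (200 trees) | 0.6615 | 0.6424 |\n",
"| Random forest (200 trees) | 0.6755 | 0.6701 |\n",
"| Logistic regression (newton-cg) | 0.6870 | 0.6787 |\n",
"\n",
"Logistic regression edges out the tree ensembles here, and the random forest's near-perfect train accuracy (0.958) against 0.676 on test suggests it is overfitting the 1,000-feature count matrix."
]
}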
]
}