Created
May 27, 2019 04:43
-
-
Save ajitesh123/e233c8fb2c758c7e5fe01d5f1d04ebd2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 334, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Tweets False\n", | |
"Length False\n", | |
"ID False\n", | |
"Date False\n", | |
"Source False\n", | |
"Likes False\n", | |
"RTs False\n", | |
"dtype: bool" | |
] | |
}, | |
"execution_count": 334, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Before proceeding with any analysis, check whether the dataset contains null values.\n", | |
"#Every column reports False, so the DataFrame has no nulls.\n", | |
"data.isnull().any()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 335, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Length</th>\n", | |
" <th>ID</th>\n", | |
" <th>Likes</th>\n", | |
" <th>RTs</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>3212.000000</td>\n", | |
" <td>3.212000e+03</td>\n", | |
" <td>3212.000000</td>\n", | |
" <td>3212.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>126.949253</td>\n", | |
" <td>1.087410e+18</td>\n", | |
" <td>16837.234745</td>\n", | |
" <td>3864.905044</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>23.801556</td>\n", | |
" <td>2.807952e+16</td>\n", | |
" <td>18983.062909</td>\n", | |
" <td>4317.286641</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>33.000000</td>\n", | |
" <td>1.041701e+18</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>41.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>124.000000</td>\n", | |
" <td>1.061404e+18</td>\n", | |
" <td>6950.500000</td>\n", | |
" <td>1748.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>140.000000</td>\n", | |
" <td>1.087907e+18</td>\n", | |
" <td>12294.500000</td>\n", | |
" <td>2971.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>140.000000</td>\n", | |
" <td>1.114145e+18</td>\n", | |
" <td>19991.250000</td>\n", | |
" <td>4570.250000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>148.000000</td>\n", | |
" <td>1.132528e+18</td>\n", | |
" <td>422854.000000</td>\n", | |
" <td>121051.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Length ID Likes RTs\n", | |
"count 3212.000000 3.212000e+03 3212.000000 3212.000000\n", | |
"mean 126.949253 1.087410e+18 16837.234745 3864.905044\n", | |
"std 23.801556 2.807952e+16 18983.062909 4317.286641\n", | |
"min 33.000000 1.041701e+18 0.000000 41.000000\n", | |
"25% 124.000000 1.061404e+18 6950.500000 1748.000000\n", | |
"50% 140.000000 1.087907e+18 12294.500000 2971.000000\n", | |
"75% 140.000000 1.114145e+18 19991.250000 4570.250000\n", | |
"max 148.000000 1.132528e+18 422854.000000 121051.000000" | |
] | |
}, | |
"execution_count": 335, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.describe()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 337, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Data cleaning for Tweet text analysis\n", | |
"#Strip URLs (text starting with http) and @mentions, each up to the next whitespace.\n", | |
"#regex=True is passed explicitly: pandas >= 2.0 defaults str.replace to literal matching,\n", | |
"#which would silently leave the URLs and mentions in place.\n", | |
"#Also remove '&' characters and any trailing whitespace.\n", | |
"data['Tweets_Cln'] = (\n", | |
"    data.Tweets\n", | |
"    .str.replace(r'http\\S+', '', regex=True)\n", | |
"    .str.replace(r'@\\S+', '', regex=True)\n", | |
"    .str.replace('&', '', regex=False)\n", | |
"    .str.rstrip()\n", | |
")\n", | |
"\n", | |
"#Drop retweets by removing tweets with zero likes.\n", | |
"#In Twitter data, retweets from users have zero likes.\n", | |
"#NOTE(review): this also drops any original tweet that happens to have zero likes.\n", | |
"data = data[data.Likes != 0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 338, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Tweets</th>\n", | |
" <th>Length</th>\n", | |
" <th>ID</th>\n", | |
" <th>Date</th>\n", | |
" <th>Source</th>\n", | |
" <th>Likes</th>\n", | |
" <th>RTs</th>\n", | |
" <th>Tweets_Cln</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Called on Vice President Shri @MVenkaiahNaidu ...</td>\n", | |
" <td>88</td>\n", | |
" <td>1132527835812225024</td>\n", | |
" <td>2019-05-26 06:04:12</td>\n", | |
" <td>Twitter Web Client</td>\n", | |
" <td>7845</td>\n", | |
" <td>1027</td>\n", | |
" <td>Called on Vice President Shri Ji.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Thank you President @jairbolsonaro. I look for...</td>\n", | |
" <td>140</td>\n", | |
" <td>1132341572090191872</td>\n", | |
" <td>2019-05-25 17:44:03</td>\n", | |
" <td>Twitter for iPhone</td>\n", | |
" <td>18518</td>\n", | |
" <td>2573</td>\n", | |
" <td>Thank you President I look forward to making ...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Tweets Length \\\n", | |
"0 Called on Vice President Shri @MVenkaiahNaidu ... 88 \n", | |
"1 Thank you President @jairbolsonaro. I look for... 140 \n", | |
"\n", | |
" ID Date Source Likes RTs \\\n", | |
"0 1132527835812225024 2019-05-26 06:04:12 Twitter Web Client 7845 1027 \n", | |
"1 1132341572090191872 2019-05-25 17:44:03 Twitter for iPhone 18518 2573 \n", | |
"\n", | |
" Tweets_Cln \n", | |
"0 Called on Vice President Shri Ji. \n", | |
"1 Thank you President I look forward to making ... " | |
] | |
}, | |
"execution_count": 338, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head(2)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment