Skip to content

Instantly share code, notes, and snippets.

@ajitesh123
Created May 27, 2019 04:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ajitesh123/e233c8fb2c758c7e5fe01d5f1d04ebd2 to your computer and use it in GitHub Desktop.
Save ajitesh123/e233c8fb2c758c7e5fe01d5f1d04ebd2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 334,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tweets False\n",
"Length False\n",
"ID False\n",
"Date False\n",
"Source False\n",
"Likes False\n",
"RTs False\n",
"dtype: bool"
]
},
"execution_count": 334,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Before proceeding with any data analysis, let's check whether the dataset contains any null values.\n",
"#We got False for every column, which means the DataFrame has no null values\n",
"data.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 335,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Length</th>\n",
" <th>ID</th>\n",
" <th>Likes</th>\n",
" <th>RTs</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>3212.000000</td>\n",
" <td>3.212000e+03</td>\n",
" <td>3212.000000</td>\n",
" <td>3212.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>126.949253</td>\n",
" <td>1.087410e+18</td>\n",
" <td>16837.234745</td>\n",
" <td>3864.905044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>23.801556</td>\n",
" <td>2.807952e+16</td>\n",
" <td>18983.062909</td>\n",
" <td>4317.286641</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>33.000000</td>\n",
" <td>1.041701e+18</td>\n",
" <td>0.000000</td>\n",
" <td>41.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>124.000000</td>\n",
" <td>1.061404e+18</td>\n",
" <td>6950.500000</td>\n",
" <td>1748.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>140.000000</td>\n",
" <td>1.087907e+18</td>\n",
" <td>12294.500000</td>\n",
" <td>2971.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>140.000000</td>\n",
" <td>1.114145e+18</td>\n",
" <td>19991.250000</td>\n",
" <td>4570.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>148.000000</td>\n",
" <td>1.132528e+18</td>\n",
" <td>422854.000000</td>\n",
" <td>121051.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Length ID Likes RTs\n",
"count 3212.000000 3.212000e+03 3212.000000 3212.000000\n",
"mean 126.949253 1.087410e+18 16837.234745 3864.905044\n",
"std 23.801556 2.807952e+16 18983.062909 4317.286641\n",
"min 33.000000 1.041701e+18 0.000000 41.000000\n",
"25% 124.000000 1.061404e+18 6950.500000 1748.000000\n",
"50% 140.000000 1.087907e+18 12294.500000 2971.000000\n",
"75% 140.000000 1.114145e+18 19991.250000 4570.250000\n",
"max 148.000000 1.132528e+18 422854.000000 121051.000000"
]
},
"execution_count": 335,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 337,
"metadata": {},
"outputs": [],
"source": [
"#Data cleaning for Tweet text analysis \n",
"#Search for text beginning with http or @ and delete it, together with the characters that follow, up to the next whitespace \n",
"#Also, remove &amp and any whitespace characters trailing the last character \n",
"data['Tweets_Cln']=data.Tweets.str.replace(r'http\\S+', '').str.replace(r'@\\S+', '').str.replace('&amp', '').str.rstrip()\n",
"\n",
"#We can remove the Tweets that are retweets by deleting the tweets that have zero likes. \n",
"#In Twitter data, retweets from users have zero likes\n",
"\n",
"data=data[data.Likes!=0]"
]
},
{
"cell_type": "code",
"execution_count": 338,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Tweets</th>\n",
" <th>Length</th>\n",
" <th>ID</th>\n",
" <th>Date</th>\n",
" <th>Source</th>\n",
" <th>Likes</th>\n",
" <th>RTs</th>\n",
" <th>Tweets_Cln</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Called on Vice President Shri @MVenkaiahNaidu ...</td>\n",
" <td>88</td>\n",
" <td>1132527835812225024</td>\n",
" <td>2019-05-26 06:04:12</td>\n",
" <td>Twitter Web Client</td>\n",
" <td>7845</td>\n",
" <td>1027</td>\n",
" <td>Called on Vice President Shri Ji.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Thank you President @jairbolsonaro. I look for...</td>\n",
" <td>140</td>\n",
" <td>1132341572090191872</td>\n",
" <td>2019-05-25 17:44:03</td>\n",
" <td>Twitter for iPhone</td>\n",
" <td>18518</td>\n",
" <td>2573</td>\n",
" <td>Thank you President I look forward to making ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Tweets Length \\\n",
"0 Called on Vice President Shri @MVenkaiahNaidu ... 88 \n",
"1 Thank you President @jairbolsonaro. I look for... 140 \n",
"\n",
" ID Date Source Likes RTs \\\n",
"0 1132527835812225024 2019-05-26 06:04:12 Twitter Web Client 7845 1027 \n",
"1 1132341572090191872 2019-05-25 17:44:03 Twitter for iPhone 18518 2573 \n",
"\n",
" Tweets_Cln \n",
"0 Called on Vice President Shri Ji. \n",
"1 Thank you President I look forward to making ... "
]
},
"execution_count": 338,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment