Skip to content

Instantly share code, notes, and snippets.

@PBPatil
Created June 15, 2018 10:26
Show Gist options
  • Save PBPatil/2112a80c69437c47918d025805dd3bef to your computer and use it in GitHub Desktop.
Save PBPatil/2112a80c69437c47918d025805dd3bef to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preliminaries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import re\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Reading "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tweets_data_path = 'twitter_data.txt'\n",
"\n",
"# Parse the file as one JSON document per line; lines that fail to parse are\n",
"# skipped so a single bad record does not abort the load.\n",
"tweets_data = []\n",
"with open(tweets_data_path, \"r\") as tweets_file:  # handle is closed even on error\n",
"    for line in tweets_file:\n",
"        try:\n",
"            tweets_data.append(json.loads(line))\n",
"        except ValueError:\n",
"            # json.loads reports malformed input as ValueError (JSONDecodeError on\n",
"            # Python 3 is a subclass). Anything else is a real problem and is\n",
"            # allowed to surface instead of being silently swallowed.\n",
"            continue"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Tweets Count:79359\n"
]
}
],
"source": [
"# Report how many lines were parsed into tweet records.\n",
"parsed_tweet_count = len(tweets_data)\n",
"print ('Total Tweets Count:{}'.format(parsed_tweet_count))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Structuring the raw data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Materialise each column as a list before handing it to pandas: on Python 3\n",
"# map() returns a lazy iterator rather than a list, so list comprehensions keep\n",
"# this cell working on either interpreter.\n",
"tweets = pd.DataFrame()\n",
"# Records without a 'text' field get a single space as a placeholder.\n",
"tweets['text'] = [tweet.get('text', ' ') for tweet in tweets_data]\n",
"# Records without a 'lang' field get None.\n",
"tweets['lang'] = [tweet.get('lang') for tweet in tweets_data]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Drawing insights"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of tweets per value of the 'lang' column.\n",
"tweets_by_lang = tweets['lang'].value_counts()\n",
"\n",
"fig, ax = plt.subplots()\n",
"# Tick label sizes are set per axis before anything is drawn.\n",
"for axis_name, label_size in (('x', 15), ('y', 10)):\n",
"    ax.tick_params(axis=axis_name, labelsize=label_size)\n",
"ax.set_xlabel('Languages', fontsize=15)\n",
"ax.set_ylabel('Number of tweets' , fontsize=15)\n",
"ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')\n",
"# Draw the first five entries of the counts on the prepared axes.\n",
"top_five_langs = tweets_by_lang.head(5)\n",
"top_five_langs.plot(ax=ax, kind='bar', color='orange');\n",
"plt.savefig('top_5_langs.jpg',bbox_inches='tight', pad_inches=0.3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mining Tweets based on Keywords"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def word_in_text(word, text):\n",
"    \"\"\"Return True if ``word`` occurs anywhere in ``text``, ignoring case.\n",
"\n",
"    ``word`` is treated as a literal keyword: it is passed through\n",
"    ``re.escape`` so that characters such as ``+``, ``?`` or ``(`` are matched\n",
"    as themselves instead of being read as regular-expression syntax.\n",
"    Both arguments are lower-cased before the search.\n",
"    \"\"\"\n",
"    pattern = re.escape(word.lower())\n",
"    return re.search(pattern, text.lower()) is not None"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Add one boolean column per tracked hashtag, True where the tweet text contains it.\n",
"for column, hashtag in (('FIFA', '#FIFA'), ('WorldCup', '#WorldCup')):\n",
"    tweets[column] = tweets['text'].apply(lambda tweet, tag=hashtag: word_in_text(tag, tweet))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"keywords = ['FIFA','WorldCup']\n",
"# Summing a boolean column counts its True rows and yields 0 when there are none,\n",
"# whereas value_counts()[True] raises KeyError in that case.\n",
"tweets_by_keywords = [int(tweets[keyword].sum()) for keyword in keywords]\n",
"x = list(range(len(keywords)))\n",
"width = 0.8\n",
"fig, ax = plt.subplots()\n",
"# align='center' is explicit so every bar is centred on its x position.\n",
"ax.bar(x, tweets_by_keywords, width, alpha=1, color='g', align='center')\n",
"\n",
"# Setting axis labels and ticks\n",
"ax.set_ylabel('Number of tweets', fontsize=15)\n",
"ax.set_title('Feed', fontsize=10, fontweight='bold')\n",
"# Ticks sit at the bar centres; the earlier 0.4 * width offset shifted the\n",
"# labels to the right of centred bars.\n",
"ax.set_xticks(x)\n",
"ax.set_xticklabels(keywords, rotation=90)\n",
"ax.grid()\n",
"fig.tight_layout()\n",
"fig.savefig('selected_keywords.jpg',bbox_inches='tight', pad_inches=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tweets with keyword FIFA: 835\n",
"Tweets with keyword WorldCup: 9818\n"
]
}
],
"source": [
"#Counting Tweets\n",
"# .sum() on a boolean column counts the True rows and gives 0 when no tweet\n",
"# matched; value_counts()[True] raises KeyError in that case.\n",
"print ('Tweets with keyword FIFA: {}'.format(tweets['FIFA'].sum()))\n",
"print ('Tweets with keyword WorldCup: {}'.format(tweets['WorldCup'].sum()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"__Note__: \n",
"Keywords used are:\n",
"- FIFA,World,Cup,football,FIFA World Cup,#FIFA2018,WorldCup,#WorldCup2018,#FifaWorldCup,#FIFAWorldCup,RUSKSA,#RUSKSA,prediction,win,#FIFA.\n",
"- Hence the total count is 79K+. For simplicity's sake, I have taken only two keywords here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Targeted Selection"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Flag tweets that carry the hashtag of each team in the fixture.\n",
"for team_tag in ('#RUS', '#KSA'):\n",
"    tweets[team_tag] = tweets['text'].apply(lambda tweet, tag=team_tag: word_in_text(tag, tweet))\n",
"\n",
"# 'Neutral' is True when any of the three fixture hashtags is found in the text.\n",
"# NOTE(review): word_in_text searches anywhere in the text, so '#RUS' also matches\n",
"# inside '#RUSKSA'; 'Neutral' is therefore the union of the flags above rather\n",
"# than tweets using only '#RUSKSA' - confirm this is intended.\n",
"fixture_tags = ('#RUS', '#KSA', '#RUSKSA')\n",
"tweets['Neutral'] = tweets['text'].apply(lambda tweet: any(word_in_text(tag, tweet) for tag in fixture_tags))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"keywords= ['#RUS','#KSA','#RUSKSA']\n",
"# Keep only tweets flagged in the 'Neutral' column, then count the True rows of\n",
"# each flag column. .sum() returns 0 for a column with no matches, whereas\n",
"# value_counts()[True] raises KeyError.\n",
"# NOTE(review): the bar labelled '#RUSKSA' plots the 'Neutral' column count -\n",
"# confirm that is the intended series.\n",
"flagged = tweets[tweets['Neutral'] == True]\n",
"tweets_by_keywords = [int(flagged[column].sum()) for column in ('#RUS', '#KSA', 'Neutral')]\n",
"x = list(range(len(keywords)))\n",
"width = 0.8\n",
"fig, ax = plt.subplots()\n",
"# align='center' is explicit so every bar is centred on its x position.\n",
"ax.bar(x, tweets_by_keywords, width, alpha=1, color='r', align='center')\n",
"ax.set_ylabel('Number of tweets', fontsize=15)\n",
"ax.set_title('Support: Russia vs.Saudi Arabia vs. #RUSKSA ', fontsize=10, fontweight='bold')\n",
"# Ticks sit at the bar centres; the earlier 0.4 * width offset shifted the\n",
"# labels to the right of centred bars.\n",
"ax.set_xticks(x)\n",
"ax.set_xticklabels(keywords)\n",
"ax.grid()\n",
"fig.savefig('Twitter_feed.jpg',bbox_inches='tight', pad_inches=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Russian Supporters:7796\n",
"Saudi Supporters:4527\n",
"Neutral:8488\n"
]
}
],
"source": [
"#Counting Tweets\n",
"# .sum() on a boolean column counts the True rows and gives 0 when no tweet\n",
"# matched; value_counts()[True] raises KeyError in that case.\n",
"print ('Russian Supporters:{}'.format(tweets['#RUS'].sum()))\n",
"print ('Saudi Supporters:{}'.format(tweets['#KSA'].sum()))\n",
"print ('Neutral:{}'.format(tweets['Neutral'].sum()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment