Sentiment Analysis of Tweets through NLTK
{
"metadata": {
"name": "tweet"
},
"name": "tweet",
"nbformat": 2,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"source": "<center><h1><u><b>Sentiment Analysis of Tweets through NLTK</b></u></h1></center>"
},
{
"cell_type": "markdown",
"source": "<h2>Authentication</h2>\nWe read the consumer keys and OAuth access tokens from the file keys.txt."
},
{
"cell_type": "code",
"collapsed": true,
"input": "import json\nimport twitter\n\n# Read the four API credentials from keys.txt\nauthval = json.load(open(\"keys.txt\"))\nCONSUMER_KEY = authval['CONSUMER_KEY']\nCONSUMER_SECRET = authval['CONSUMER_SECRET']\nOAUTH_TOKEN = authval['OAUTH_TOKEN']\nOAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']\n\nauth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n                           CONSUMER_KEY, CONSUMER_SECRET)\n\ntwitter_api = twitter.Twitter(auth=auth)",
"language": "python",
"outputs": [],
"prompt_number": 1
},
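{
"cell_type": "markdown",
"source": "<br><font size=4>For reference, a minimal sketch of what keys.txt is assumed to contain (placeholder values, not real credentials):</font><br><br>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "# keys.txt is assumed to be a single JSON object holding the four\n# credentials loaded above; the values below are placeholders.\nexample_keys = {\n    \"CONSUMER_KEY\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"CONSUMER_SECRET\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"OAUTH_TOKEN\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"OAUTH_TOKEN_SECRET\": \"xxxxxxxxxxxxxxxxxxxxx\"\n}\nprint json.dumps(example_keys, indent=1)",
"language": "python",
"outputs": []
},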
{
"cell_type": "markdown",
"source": "<h2>Searching for tweets</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "def twitter_search(twitter_api, q, max_results=10, **kw):\n\n    # Fetch the first batch of (up to) 100 matching tweets\n    search_results = twitter_api.search.tweets(q=q, count=100, **kw)\n    statuses = search_results['statuses']\n\n    # We cap the limit at 50 due to internet speed concerns;\n    # raise it for practical use\n    max_results = min(50, max_results)\n\n    for _ in range(10):\n        try:\n            next_results = search_results['search_metadata']['next_results']\n        except KeyError:  # no further pages of results\n            break\n\n        # Parse the query-string cursor into keyword arguments\n        kwargs = dict([ kv.split('=')\n                        for kv in next_results[1:].split(\"&\") ])\n\n        search_results = twitter_api.search.tweets(**kwargs)\n        statuses += search_results['statuses']\n\n        if len(statuses) > max_results:\n            break\n\n    return statuses",
"language": "python",
"outputs": [],
"prompt_number": 9
},
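{
"cell_type": "markdown",
"source": "<br><font size=4>As an illustration of the cursoring step above, here is how a next_results query string (a hypothetical value, not a captured response) is parsed into keyword arguments for the follow-up search call:</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Hypothetical 'next_results' string of the kind found under\n# search_metadata; drop the leading '?' and split each 'key=value'\n# pair into a dict of keyword arguments.\nnext_results = '?max_id=445000000000000000&q=%23AAP&count=100&include_entities=1'\nkwargs = dict([ kv.split('=')\n                for kv in next_results[1:].split(\"&\") ])\nprint kwargs",
"language": "python",
"outputs": []
},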
{
"cell_type": "markdown",
"source": "<h2>Extracting tweet-entities</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "def extract_tweet_entities(statuses):\n\n    # Return one empty list per entity type when there is nothing to extract\n    if len(statuses) == 0:\n        return [], [], [], [], [], [], []\n\n    status_texts = [ status['text']\n                     for status in statuses ]\n\n    screen_names = [ user_mention['screen_name']\n                     for status in statuses\n                     for user_mention in status['entities']['user_mentions'] ]\n\n    words = [ w\n              for t in status_texts\n              for w in t.split() ]\n\n    hashtags = [ hashtag['text']\n                 for status in statuses\n                 for hashtag in status['entities']['hashtags'] ]\n\n    urls = [ url['expanded_url']\n             for status in statuses\n             for url in status['entities']['urls'] ]\n\n    symbols = [ symbol['text']\n                for status in statuses\n                for symbol in status['entities']['symbols'] ]\n\n    # 'media' is only present on statuses carrying media attachments,\n    # so fall back to an empty list per status\n    media = [ media['url']\n              for status in statuses\n              for media in status['entities'].get('media', []) ]\n\n    return status_texts, screen_names, words, hashtags, urls, media, symbols",
"language": "python",
"outputs": [],
"prompt_number": 3
},
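{
"cell_type": "markdown",
"source": "<br><font size=4>The extractor above assumes each status carries 'text' and 'entities' fields of the shape sketched below (abridged, with hypothetical values):</font><br><br>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "# Abridged sketch of one status dict as returned by the Search API;\n# the values are hypothetical and real statuses carry many more fields.\nsample_status = {\n    \"text\": \"Sample tweet mentioning @example_user #AAP http://t.co/abc123\",\n    \"entities\": {\n        \"user_mentions\": [ { \"screen_name\": \"example_user\" } ],\n        \"hashtags\": [ { \"text\": \"AAP\" } ],\n        \"urls\": [ { \"expanded_url\": \"http://example.com/article\" } ],\n        \"symbols\": []\n    }\n}\nprint extract_tweet_entities([sample_status])[0]",
"language": "python",
"outputs": []
},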
{
"cell_type": "markdown",
"source": "<br><font size=4>To avoid fetching the same tweets over and over, we store them in the text file 'statuses.txt'.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\nq = 'AAP' #Mention the Hashtag here\n\nstatuses = twitter_search(twitter_api, q, 50)\njson.dump(statuses, open(\"statuses.txt\",'w'))\n'''",
"language": "python",
"outputs": [
{
"output_type": "pyout",
"prompt_number": 4,
"text": "'\\nq = \\'AAP\\'\\n\\nstatuses = twitter_search(twitter_api, q, 50)\\njson.dump(statuses, open(\"statuses.txt\",\\'w\\'))\\n'"
}
],
"prompt_number": 4
},
{
"cell_type": "markdown",
"source": "<br><font size=4>We have already procured a few tweets, and retrieve them here from the text file mentioned above.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "json_data = open('statuses.txt')\n\nstatuses = json.load(json_data)\nstatus_texts, screen_names, words, hashtags, urls, media, symbols = extract_tweet_entities(statuses)\n\n# View the first five items of each entity type\nprint \"Total tweets: \" + str(len(status_texts)) + '\\n\\n'\nprint json.dumps(status_texts[0:5], indent=1)\nprint json.dumps(words[0:5], indent=1)\nprint json.dumps(screen_names[0:5], indent=1)\nprint json.dumps(hashtags[0:5], indent=1)\nprint json.dumps(urls[0:5], indent=1)\nprint json.dumps(media[0:5], indent=1)\nprint json.dumps(symbols[0:5], indent=1)",
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Total tweets: 198\n\n\n[\n \"Teacher\\nSUch aur Veham main kiya farak hai?\\nBoy:\\nAap hum ko paRha rahi hain\\nye SUCH hai\\nAur\\nhum paRh rahe hain ye aap ka VEHAM hai...!\", \n \"RT @pragnik: Shocking: #AAP IT admin Ankit Lal pays to trend their promoted party hashtags or anti-BJP on Twitter!\\n\\n #HDL #Namo4PM http://t\\u2026\", \n \"RT @preethidb: Get ready Mumbai, Arvind's on his way! http://t.co/BZlQP2Zf4t\", \n \"Ashok Aggarwal quits AAP, says party functioning like a private limited company http://t.co/oYLVddkskU\\\"\", \n \"AAP sabd itna cheap aur badnaam hua hai ki ab izzat dene k liye tum bolna padta hai\"\n]\n[\n \"Teacher\", \n \"SUch\", \n \"aur\", \n \"Veham\", \n \"main\"\n]\n[\n \"pragnik\", \n \"preethidb\", \n \"ashutosh083B\", \n \"ashutosh83B\", \n \"timesofindia\"\n]\n[\n \"AAP\", \n \"HDL\", \n \"Namo4PM\", \n \"AAP\", \n \"KejriwalLies\"\n]\n[\n \"http://timesofindia.indiatimes.com/india/Ashok-Aggarwal-quits-AAP-says-party-functioning-like-a-private-limited-company/articleshow/31838083.cms?utm_source=twitter.com&utm_medium=referral&utm_campaign=timesofindia\", \n \"http://timesofindia.indiatimes.com/india/Ashok-Aggarwal-quits-AAP-says-party-functioning-like-a-private-limited-company/articleshow/31838083.cms?utm_source=twitter.com&utm_medium=referral&utm_campaign=timesofindia\", \n \"http://fb.me/2h8yDhR3L\", \n \"http://timesofindia.indiatimes.com/city/delhi/BJP-may-field-women-to-trip-AAP/articleshow/31805002.cms\", \n \"http://www.niticentral.com/2014/03/11/lessons-gujarat-villagers-taught-to-arvind-kejriwal-198517.html\"\n]\n[]\n[]"
}
],
"prompt_number": 5
},
{
"cell_type": "markdown",
"source": "<h2>Analysis of Tweets</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "import nltk\nimport nltk.classify\n\n# Labelled training data: one tweet per line in each file\nposlines = open(r'small.pos', 'r').read().splitlines()\nneglines = open(r'small.neg', 'r').read().splitlines()\ntrainset = [(x, 'positive') for x in poslines] + [(x, 'negative') for x in neglines]\ntweets = []\n\n# Lowercase each tweet and keep only words of 3+ characters\nfor (words, sentiment) in trainset:\n    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]\n    tweets.append((words_filtered, sentiment))\n\nstop_words = open(r'stop-words.txt', 'r').read()\nstop = [e.lower() for e in stop_words.split()]\n\ndef get_words_in_tweets(tweets):\n    all_words = []\n    for (words, sentiment) in tweets:\n        all_words.extend(words)\n    return all_words\n\ndef get_word_features(wordlist):\n    # FreqDist.keys() returns the vocabulary ordered by frequency\n    wordlist = nltk.FreqDist(wordlist)\n    word_features = wordlist.keys()\n    return word_features\n\nword_features = get_word_features(get_words_in_tweets(tweets))\n\n# Drop stopwords from the feature vocabulary\nw = [x for x in word_features if x not in stop]\n\ndef extract_features(document):\n    # Binary bag-of-words: does the document contain each vocabulary word?\n    document_words = set(document)\n    features = {}\n    for word in w:\n        features['contains(%s)' % word] = (word in document_words)\n    return features\n\ntraining_set = nltk.classify.apply_features(extract_features, tweets)\n\nclassifier = nltk.NaiveBayesClassifier.train(training_set)\n\nlop = 0  # number of tweets classified positive\nlon = 0  # number of tweets classified negative\n\nfor tweet in status_texts:\n    sentiment = classifier.classify(extract_features(tweet.split()))\n    if sentiment == 'positive':\n        lop = lop + 1\n    else:\n        lon = lon + 1",
"language": "python",
"outputs": [],
"prompt_number": 28
},
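{
"cell_type": "markdown",
"source": "<br><font size=4>As an optional inspection step (not part of the original run), NLTK's Naive Bayes classifier can report which word features weigh most heavily in its decisions:</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Show the ten features with the highest likelihood ratios\n# between the 'positive' and 'negative' labels.\nclassifier.show_most_informative_features(10)",
"language": "python",
"outputs": []
},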
{
"cell_type": "markdown",
"source": "<h3>Our Findings:</h3>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "print '\\n'\nprint \"Level of positiveness: \" + str(lop/float(len(status_texts)))\nprint \"Level of negativeness: \" + str(lon/float(len(status_texts)))\n\nif lop > lon:\n    print \"\\nThe sentiment for the given hashtag is positive\\n\"\nelse:\n    print \"\\nThe sentiment for the given hashtag is negative\\n\"",
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "\n\nLevel of positiveness: 0.691919191919\nLevel of negativeness: 0.308080808081\n\nThe sentiment for the given hashtag is positive"
}
],
"prompt_number": 26
},
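{
"cell_type": "markdown",
"source": "<br><font size=4>A rough sanity check one could add (a sketch, not part of the original analysis): score the classifier on a slice of the labelled data. For an honest estimate the slice should be held out of training, so retrain without it first.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Hypothetical held-out slice of the labelled tweets; note the\n# classifier above was trained on all of 'tweets', so this number\n# is optimistic unless you retrain without the slice.\ntest_tweets = tweets[::5]\ntest_set = nltk.classify.apply_features(extract_features, test_tweets)\nprint nltk.classify.accuracy(classifier, test_set)",
"language": "python",
"outputs": []
},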
{
"cell_type": "code",
"collapsed": true,
"input": "",
"language": "python",
"outputs": []
}
]
}
]
}