Sentiment Analysis of Tweets through NLTK
{
"metadata": {
"name": "tweet"
},
"name": "tweet",
"nbformat": 2,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"source": "<center><h1><u><b>Sentiment Analysis of Tweets through NLTK</b></u></h1></center>"
},
{
"cell_type": "markdown",
"source": "<h2>Authentication</h2>\nWe read the consumer keys and OAuth access tokens from the file keys.txt."
},
{
"cell_type": "code",
"collapsed": true,
"input": "import json\nimport twitter\n\n# Read the four API credentials from keys.txt\nauthval = json.load(open(\"keys.txt\"))\nCONSUMER_KEY = authval['CONSUMER_KEY']\nCONSUMER_SECRET = authval['CONSUMER_SECRET']\nOAUTH_TOKEN = authval['OAUTH_TOKEN']\nOAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']\n\nauth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n                           CONSUMER_KEY, CONSUMER_SECRET)\n\ntwitter_api = twitter.Twitter(auth=auth)",
"language": "python",
"outputs": [],
"prompt_number": 1
},
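{
"cell_type": "markdown",
"source": "<br><font size=4>For reference, a minimal sketch of what keys.txt is assumed to contain (placeholder values, not real credentials):</font><br><br>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "# keys.txt is assumed to be a single JSON object holding the four\n# credentials loaded above; the values below are placeholders.\nexample_keys = {\n    \"CONSUMER_KEY\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"CONSUMER_SECRET\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"OAUTH_TOKEN\": \"xxxxxxxxxxxxxxxxxxxxx\",\n    \"OAUTH_TOKEN_SECRET\": \"xxxxxxxxxxxxxxxxxxxxx\"\n}\nprint json.dumps(example_keys, indent=1)",
"language": "python",
"outputs": []
},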
{
"cell_type": "markdown",
"source": "<h2>Searching for tweets</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "def twitter_search(twitter_api, q, max_results=10, **kw):\n\n    # Fetch the first batch of (up to) 100 matching tweets\n    search_results = twitter_api.search.tweets(q=q, count=100, **kw)\n    statuses = search_results['statuses']\n\n    # We cap the limit at 50 due to internet speed concerns;\n    # raise it for practical use\n    max_results = min(50, max_results)\n\n    for _ in range(10):\n        try:\n            next_results = search_results['search_metadata']['next_results']\n        except KeyError:  # no further pages of results\n            break\n\n        # Parse the query-string cursor into keyword arguments\n        kwargs = dict([ kv.split('=')\n                        for kv in next_results[1:].split(\"&\") ])\n\n        search_results = twitter_api.search.tweets(**kwargs)\n        statuses += search_results['statuses']\n\n        if len(statuses) > max_results:\n            break\n\n    return statuses",
"language": "python",
"outputs": [],
"prompt_number": 9
},
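{
"cell_type": "markdown",
"source": "<br><font size=4>As an illustration of the cursoring step above, here is how a next_results query string (a hypothetical value, not a captured response) is parsed into keyword arguments for the follow-up search call:</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Hypothetical 'next_results' string of the kind found under\n# search_metadata; drop the leading '?' and split each 'key=value'\n# pair into a dict of keyword arguments.\nnext_results = '?max_id=445000000000000000&q=%23AAP&count=100&include_entities=1'\nkwargs = dict([ kv.split('=')\n                for kv in next_results[1:].split(\"&\") ])\nprint kwargs",
"language": "python",
"outputs": []
},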
{
"cell_type": "markdown",
"source": "<h2>Extracting tweet-entities</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "def extract_tweet_entities(statuses):\n\n    # Return one empty list per entity type when there is nothing to extract\n    if len(statuses) == 0:\n        return [], [], [], [], [], [], []\n\n    status_texts = [ status['text']\n                     for status in statuses ]\n\n    screen_names = [ user_mention['screen_name']\n                     for status in statuses\n                     for user_mention in status['entities']['user_mentions'] ]\n\n    words = [ w\n              for t in status_texts\n              for w in t.split() ]\n\n    hashtags = [ hashtag['text']\n                 for status in statuses\n                 for hashtag in status['entities']['hashtags'] ]\n\n    urls = [ url['expanded_url']\n             for status in statuses\n             for url in status['entities']['urls'] ]\n\n    symbols = [ symbol['text']\n                for status in statuses\n                for symbol in status['entities']['symbols'] ]\n\n    # 'media' is only present on statuses carrying media attachments,\n    # so fall back to an empty list per status\n    media = [ media['url']\n              for status in statuses\n              for media in status['entities'].get('media', []) ]\n\n    return status_texts, screen_names, words, hashtags, urls, media, symbols",
"language": "python",
"outputs": [],
"prompt_number": 3
},
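{
"cell_type": "markdown",
"source": "<br><font size=4>The extractor above assumes each status carries 'text' and 'entities' fields of the shape sketched below (abridged, with hypothetical values):</font><br><br>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "# Abridged sketch of one status dict as returned by the Search API;\n# the values are hypothetical and real statuses carry many more fields.\nsample_status = {\n    \"text\": \"Sample tweet mentioning @example_user #AAP http://t.co/abc123\",\n    \"entities\": {\n        \"user_mentions\": [ { \"screen_name\": \"example_user\" } ],\n        \"hashtags\": [ { \"text\": \"AAP\" } ],\n        \"urls\": [ { \"expanded_url\": \"http://example.com/article\" } ],\n        \"symbols\": []\n    }\n}\nprint extract_tweet_entities([sample_status])[0]",
"language": "python",
"outputs": []
},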
{
"cell_type": "markdown",
"source": "<br><font size=4>To avoid fetching the same tweets over and over, we store them in the text file 'statuses.txt'.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\nq = 'AAP' #Mention the Hashtag here\n\nstatuses = twitter_search(twitter_api, q, 50)\njson.dump(statuses, open(\"statuses.txt\",'w'))\n'''",
"language": "python",
"outputs": [
{
"output_type": "pyout",
"prompt_number": 4,
"text": "'\\nq = \\'AAP\\'\\n\\nstatuses = twitter_search(twitter_api, q, 50)\\njson.dump(statuses, open(\"statuses.txt\",\\'w\\'))\\n'"
}
],
"prompt_number": 4
},
{
"cell_type": "markdown",
"source": "<br><font size=4>We have already procured a few tweets, and retrieve them here from the text file mentioned above.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "json_data = open('statuses.txt')\n\nstatuses = json.load(json_data)\nstatus_texts, screen_names, words, hashtags, urls, media, symbols = extract_tweet_entities(statuses)\n\n# View the first five items of each entity type\nprint \"Total tweets: \" + str(len(status_texts)) + '\\n\\n'\nprint json.dumps(status_texts[0:5], indent=1)\nprint json.dumps(words[0:5], indent=1)\nprint json.dumps(screen_names[0:5], indent=1)\nprint json.dumps(hashtags[0:5], indent=1)\nprint json.dumps(urls[0:5], indent=1)\nprint json.dumps(media[0:5], indent=1)\nprint json.dumps(symbols[0:5], indent=1)",
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Total tweets: 198\n\n\n[\n \"Teacher\\nSUch aur Veham main kiya farak hai?\\nBoy:\\nAap hum ko paRha rahi hain\\nye SUCH hai\\nAur\\nhum paRh rahe hain ye aap ka VEHAM hai...!\", \n \"RT @pragnik: Shocking: #AAP IT admin Ankit Lal pays to trend their promoted party hashtags or anti-BJP on Twitter!\\n\\n #HDL #Namo4PM http://t\\u2026\", \n \"RT @preethidb: Get ready Mumbai, Arvind's on his way! http://t.co/BZlQP2Zf4t\", \n \"Ashok Aggarwal quits AAP, says party functioning like a private limited company http://t.co/oYLVddkskU\\\"\", \n \"AAP sabd itna cheap aur badnaam hua hai ki ab izzat dene k liye tum bolna padta hai\"\n]\n[\n \"Teacher\", \n \"SUch\", \n \"aur\", \n \"Veham\", \n \"main\"\n]\n[\n \"pragnik\", \n \"preethidb\", \n \"ashutosh083B\", \n \"ashutosh83B\", \n \"timesofindia\"\n]\n[\n \"AAP\", \n \"HDL\", \n \"Namo4PM\", \n \"AAP\", \n \"KejriwalLies\"\n]\n[\n \"http://timesofindia.indiatimes.com/india/Ashok-Aggarwal-quits-AAP-says-party-functioning-like-a-private-limited-company/articleshow/31838083.cms?utm_source=twitter.com&utm_medium=referral&utm_campaign=timesofindia\", \n \"http://timesofindia.indiatimes.com/india/Ashok-Aggarwal-quits-AAP-says-party-functioning-like-a-private-limited-company/articleshow/31838083.cms?utm_source=twitter.com&utm_medium=referral&utm_campaign=timesofindia\", \n \"http://fb.me/2h8yDhR3L\", \n \"http://timesofindia.indiatimes.com/city/delhi/BJP-may-field-women-to-trip-AAP/articleshow/31805002.cms\", \n \"http://www.niticentral.com/2014/03/11/lessons-gujarat-villagers-taught-to-arvind-kejriwal-198517.html\"\n]\n[]\n[]"
}
],
"prompt_number": 5
},
{
"cell_type": "markdown",
"source": "<h2>Analysis of Tweets</h2>"
},
{
"cell_type": "code",
"collapsed": true,
"input": "import nltk\nimport nltk.classify\n\n# Labelled training data: one tweet per line in each file\nposlines = open(r'small.pos', 'r').read().splitlines()\nneglines = open(r'small.neg', 'r').read().splitlines()\ntrainset = [(x, 'positive') for x in poslines] + [(x, 'negative') for x in neglines]\ntweets = []\n\n# Lowercase each tweet and keep only words of 3+ characters\nfor (words, sentiment) in trainset:\n    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]\n    tweets.append((words_filtered, sentiment))\n\nstop_words = open(r'stop-words.txt', 'r').read()\nstop = [e.lower() for e in stop_words.split()]\n\ndef get_words_in_tweets(tweets):\n    all_words = []\n    for (words, sentiment) in tweets:\n        all_words.extend(words)\n    return all_words\n\ndef get_word_features(wordlist):\n    # FreqDist.keys() returns the vocabulary ordered by frequency\n    wordlist = nltk.FreqDist(wordlist)\n    word_features = wordlist.keys()\n    return word_features\n\nword_features = get_word_features(get_words_in_tweets(tweets))\n\n# Drop stopwords from the feature vocabulary\nw = [x for x in word_features if x not in stop]\n\ndef extract_features(document):\n    # Binary bag-of-words: does the document contain each vocabulary word?\n    document_words = set(document)\n    features = {}\n    for word in w:\n        features['contains(%s)' % word] = (word in document_words)\n    return features\n\ntraining_set = nltk.classify.apply_features(extract_features, tweets)\n\nclassifier = nltk.NaiveBayesClassifier.train(training_set)\n\nlop = 0  # number of tweets classified positive\nlon = 0  # number of tweets classified negative\n\nfor tweet in status_texts:\n    sentiment = classifier.classify(extract_features(tweet.split()))\n    if sentiment == 'positive':\n        lop = lop + 1\n    else:\n        lon = lon + 1",
"language": "python",
"outputs": [],
"prompt_number": 28
},
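{
"cell_type": "markdown",
"source": "<br><font size=4>As an optional inspection step (not part of the original run), NLTK's Naive Bayes classifier can report which word features weigh most heavily in its decisions:</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Show the ten features with the highest likelihood ratios\n# between the 'positive' and 'negative' labels.\nclassifier.show_most_informative_features(10)",
"language": "python",
"outputs": []
},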
{
"cell_type": "markdown",
"source": "<h3>Our Findings:</h3>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "print '\\n'\nprint \"Level of positiveness: \" + str(lop/float(len(status_texts)))\nprint \"Level of negativeness: \" + str(lon/float(len(status_texts)))\n\nif lop > lon:\n    print \"\\nThe sentiment for the given hashtag is positive\\n\"\nelse:\n    print \"\\nThe sentiment for the given hashtag is negative\\n\"",
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "\n\nLevel of positiveness: 0.691919191919\nLevel of negativeness: 0.308080808081\n\nThe sentiment for the given hashtag is positive"
}
],
"prompt_number": 26
},
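{
"cell_type": "markdown",
"source": "<br><font size=4>A rough sanity check one could add (a sketch, not part of the original analysis): score the classifier on a slice of the labelled data. For an honest estimate the slice should be held out of training, so retrain without it first.</font><br><br>"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Hypothetical held-out slice of the labelled tweets; note the\n# classifier above was trained on all of 'tweets', so this number\n# is optimistic unless you retrain without the slice.\ntest_tweets = tweets[::5]\ntest_set = nltk.classify.apply_features(extract_features, test_tweets)\nprint nltk.classify.accuracy(classifier, test_set)",
"language": "python",
"outputs": []
},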
{
"cell_type": "code",
"collapsed": true,
"input": "",
"language": "python",
"outputs": []
}
]
}
]
}