Created
February 15, 2015 03:06
-
-
Save amirziai/c117f491f5070558c8bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:be11b254572e53a3734ec12d696e0e322aac08909693b8bec4dd82eeefb13a20" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import json\n", | |
"import re\n", | |
"import string\n", | |
"from nltk import word_tokenize" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 174 | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "class TweetSerializer:\n",
 "    \"\"\"Serialize tweets into numbered JSON-array files (tweets-1.json, ...).\n",
 "\n",
 "    Usage: start() opens the next file, write(tweet) appends one tweet,\n",
 "    end() closes the JSON array and the file.\n",
 "    \"\"\"\n",
 "\n",
 "    def __init__(self):\n",
 "        self.out = None    # currently open file handle, or None\n",
 "        self.first = True  # True until first tweet written (comma placement)\n",
 "        self.count = 0     # number of files opened so far\n",
 "\n",
 "    def start(self):\n",
 "        \"\"\"Open the next tweets-<n>.json file and write the opening '['.\"\"\"\n",
 "        self.count += 1\n",
 "        fname = \"tweets-\" + str(self.count) + \".json\"\n",
 "        self.out = open(fname, \"w\")\n",
 "        self.out.write(\"[\\n\")\n",
 "        self.first = True\n",
 "\n",
 "    def end(self):\n",
 "        \"\"\"Write the closing ']' and close the file; safe to call twice.\"\"\"\n",
 "        if self.out is not None:\n",
 "            self.out.write(\"\\n]\\n\")\n",
 "            self.out.close()\n",
 "            self.out = None\n",
 "\n",
 "    def write(self, tweet):\n",
 "        \"\"\"Append one JSON-serializable tweet to the currently open file.\n",
 "\n",
 "        Raises ValueError if no file is open (start() not called, or\n",
 "        end() already called) instead of an opaque AttributeError.\n",
 "        \"\"\"\n",
 "        if self.out is None:\n",
 "            raise ValueError(\"no open file: call start() before write()\")\n",
 "        if not self.first:\n",
 "            self.out.write(\",\\n\")\n",
 "        self.first = False\n",
 "        self.out.write(json.dumps(tweet).encode('utf8'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Smoke-test the serializer on a plain string before using real tweets.\n",
 "x = TweetSerializer()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Opens tweets-1.json and writes the opening '['.\n",
 "x.start()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "x.write('this is a sample tweet')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Writes the closing ']' and closes the file.\n",
 "x.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 39
},
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Twitter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import sys\n", | |
"import datetime\n", | |
"import urllib\n", | |
"import signal\n", | |
"import json" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import tweepy\n", | |
"consumer_key = \"BmXCpNVAs3K7fALROjRqLCZIx\";\n", | |
"consumer_secret = \"jkSgmklrMlBhtDap3cw7s4Bo1t67irWpkbLeNvYIIxcUr5mNIp\";\n", | |
"access_token = \"1654270760-c3vmF3tTobV5tSpTNnEBKZhCJlimUMhZmTJtOih\";\n", | |
"access_token_secret = \"OUUe7LpjKVRdi8NiwKaqmg3pWPIjt7Xxet3pM4sVsuLhK\";\n", | |
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", | |
"auth.set_access_token(access_token, access_token_secret)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)\n",
 "\n",
 "N = 3  # number of tweets to fetch\n",
 "\n",
 "# Twitter search operators (since:/until:) are part of the query string\n",
 "# itself. NOTE(review): the previous revision appended 'since:...' to an\n",
 "# already URL-encoded query with no separating space (corrupting it),\n",
 "# built two more q values that were immediately discarded, and passed an\n",
 "# unsupported 'start=' keyword to api.search.\n",
 "q = urllib.quote_plus('#microsoft #mojang since:2015-02-08')\n",
 "tweets = tweepy.Cursor(api.search, q=q).items(N)\n",
 "\n",
 "serializer = TweetSerializer()\n",
 "serializer.start()\n",
 "\n",
 "for tweet in tweets:\n",
 "    # Full API payload is in tweet._json; tweet.text is the display text.\n",
 "    print tweet.text, '\\n'\n",
 "    serializer.write(tweet._json)\n",
 "\n",
 "serializer.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Open JSON and extract words" | |
] | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Read the first serialized batch back in as a list of tweet dicts.\n",
 "with open('tweets-1.json') as fh:\n",
 "    data = json.load(fh)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"\n", | |
"import re\n", | |
"regex_url = r'(http|https)://\\S+'\n", | |
"\n", | |
"dic = {}\n", | |
"\n", | |
"for d in data:\n", | |
" tokens = word_tokenize(d['text'])\n", | |
" print tokens, '\\n'\n", | |
" for token in tokens:\n", | |
" if token in dic:\n", | |
" dic[token] += 1\n", | |
" else:\n", | |
" dic[token] = 1\n", | |
" #print i['text'], '\\n'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Hey', u'followers', u'\\u2764\\ufe0f', u'join', u'the', u'club', u'\\U0001f618', u'\\u2606', u'#', u'selfie', u'#', u'minecraft', u'#', u'chocker', u'#', u'pennys', u'#', u'bbw', u'#', u'mojang', u'#', u'microsoft', u'#', u'naruto', u'#', u'pocketbacs\\u2026', u'http', u':', u'//t.co/QZWL50K4Ff'] \n", | |
"\n", | |
"[u'http', u':', u'//t.co/ZzSEK3UhQL', u'For', u'sale', u'https', u':', u'//t.co/yS52UKlYqX', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n", | |
"\n", | |
"[u'http', u':', u'//t.co/YFo8wxhmGt', u'For', u'sale', u'https', u':', u'//t.co/WPmD0AqH6W', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"txt = data[0]['text']\n", | |
"#remove_urls_usernames(txt)\n", | |
"\n", | |
"yy = remove_stopwords(remove_punctuation(remove_urls_usernames(txt)))\n", | |
"print yy" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['hey', 'followers', 'join', 'cl', 'selfie', 'minecraft', 'chocker', 'pennys', 'bbw', 'mojang', 'microsoft', 'nar', 'pocketbacs']\n" | |
] | |
} | |
], | |
"prompt_number": 173 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Text processing procedure" | |
] | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "def remove_urls_usernames(txt):\n",
 "    \"\"\"Strip non-ASCII chars, URLs and @usernames; return lowercased tokens.\"\"\"\n",
 "    # Drop everything outside ASCII first (emoji etc.) so the regexes\n",
 "    # below operate on plain text.\n",
 "    txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n",
 "\n",
 "    regex_url = r'(http|https)://\\S+'\n",
 "    regex_username = r'@\\S+'\n",
 "\n",
 "    # BUG FIX: the previous revision also substituted an undefined name\n",
 "    # (regex_unicode), which raises NameError on a fresh kernel; the\n",
 "    # ASCII re-encode above already removes those characters.\n",
 "    txt_re = re.sub(regex_url, '', txt)\n",
 "    txt_re = re.sub(regex_username, '', txt_re)\n",
 "\n",
 "    return [x.lower() for x in word_tokenize(txt_re)]\n",
 "\n",
 "def remove_punctuation(tokenized_docs):\n",
 "    \"\"\"Strip punctuation chars from each token; drop tokens left empty.\"\"\"\n",
 "    # string.punctuation: http://docs.python.org/2/library/string.html\n",
 "    regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
 "\n",
 "    tokenized_docs_no_punctuation = []\n",
 "    for token in tokenized_docs:\n",
 "        new_token = regex.sub(u'', token)\n",
 "        if new_token != u'':\n",
 "            tokenized_docs_no_punctuation.append(new_token)\n",
 "\n",
 "    return tokenized_docs_no_punctuation\n",
 "\n",
 "def remove_stopwords(tokenized_docs_no_punctuation):\n",
 "    \"\"\"Drop English stopwords from a token list.\"\"\"\n",
 "    from nltk.corpus import stopwords\n",
 "    # Build the stopword set once: calling stopwords.words('english') per\n",
 "    # token re-reads the corpus and makes each membership test O(n).\n",
 "    stopword_set = set(stopwords.words('english'))\n",
 "    return [word for word in tokenized_docs_no_punctuation if word not in stopword_set]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 171
},
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment