amirziai/gist:0fade9e28a1bbbf47e61

## gistfile1.txt
{
 "metadata": {
  "name": "",
  "signature": "sha256:a00d2f7efeedaf8c3cba9f566d27250aa64c6cd2676cbf07d7c7b6fd0a293673"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "import string\n",
      "import re\n",
      "from nltk import word_tokenize\n",
      "import boto\n",
      "from boto.s3.key import Key\n",
      "import pylab as pl\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "AWS Connection Credentials"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "aws_key = ''\n",
      "aws_secret = ''"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Text cleanup functions"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def remove_urls_usernames(txt):\n",
      "    # escape unicode characters\n",
      "    txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n",
      "\n",
      "    regex_url = r'(http|https)://\\S+'\n",
      "    regex_username = r'@\\S+'\n",
      "    txt_re = re.sub(regex_url, '', txt)\n",
      "    txt_re = re.sub(regex_username, '', txt_re)\n",
      "\n",
      "    return [x.lower() for x in word_tokenize(txt_re)]\n",
      "\n",
      "\n",
      "def remove_punctuation(tokenized_docs):\n",
      "    regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
      "\n",
      "    tokenized_docs_no_punctuation = []\n",
      "\n",
      "    for token in tokenized_docs:\n",
      "        new_token = regex.sub(u'', token)\n",
      "        if not new_token == u'':\n",
      "            tokenized_docs_no_punctuation.append(new_token)\n",
      "\n",
      "    return tokenized_docs_no_punctuation\n",
      "\n",
      "\n",
      "def remove_stopwords(tokenized_docs_no_punctuation):\n",
      "    from nltk.corpus import stopwords\n",
      "    tokenized_docs_no_stopwords = []\n",
      "\n",
      "    for word in tokenized_docs_no_punctuation:\n",
      "        if not word in stopwords.words('english'):\n",
      "            tokenized_docs_no_stopwords.append(word)\n",
      "\n",
      "    return tokenized_docs_no_stopwords"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Analysis"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "1- Read all the files from the bucket\n",
      "<br>\n",
      "2- Load the JSON content and grab the text field\n",
      "<br>\n",
      "3- Tokenize the text and do some cleanup (remove URLs, unicode characters, ...)\n",
      "<br>\n",
      "4- Count the number of times each word appeared across all files\n",
      "<br>\n",
      "5- Show a bar chart of word frequency (only for words appearing more four or more times)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "    # s3 bucket\n",
      "    c = boto.connect_s3(aws_key, aws_secret)\n",
      "    b = c.get_bucket('amirziai-mids-w205-assignment2')\n",
      "\n",
      "    # iterate over all bucket keys (files)\n",
      "    word_dic = {}\n",
      "    for k in b.list():\n",
      "        key = Key(b)\n",
      "        for tweet in json.loads(k.get_contents_as_string()):\n",
      "            # tokenize and clean up\n",
      "            words = remove_stopwords(remove_punctuation(remove_urls_usernames(tweet['text'])))\n",
      "            \n",
      "            # word count\n",
      "            for w in words:\n",
      "                if w in word_dic:\n",
      "                    word_dic[w] += 1\n",
      "                else:\n",
      "                    word_dic[w] = 1\n",
      "\n",
      "    # print word_dic\n",
      "    show_higher_than = 10  # just making sure there are not too many words on the plot\n",
      "    word_dic_filtered = {k: v for k, v in word_dic.iteritems() if v > show_higher_than}\n",
      "    X = np.arange(len(word_dic_filtered))\n",
      "    pl.bar(X, word_dic_filtered.values(), align='center', width=0.5)\n",
      "    pl.xticks(X, word_dic_filtered.keys())\n",
      "    ymax = max(word_dic_filtered.values()) + 1\n",
      "    pl.ylim(0, ymax)\n",
      "    pl.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "",
	"signature": "sha256:a00d2f7efeedaf8c3cba9f566d27250aa64c6cd2676cbf07d7c7b6fd0a293673"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import json\n",
	"import string\n",
	"import re\n",
	"from nltk import word_tokenize\n",
	"import boto\n",
	"from boto.s3.key import Key\n",
	"import pylab as pl\n",
	"import numpy as np"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"AWS Connection Credentials"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"aws_key = ''\n",
	"aws_secret = ''"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Text cleanup functions"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def remove_urls_usernames(txt):\n",
	" # escape unicode characters\n",
	" txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n",
	"\n",
	" regex_url = r'(http\|https)://\\S+'\n",
	" regex_username = r'@\\S+'\n",
	" txt_re = re.sub(regex_url, '', txt)\n",
	" txt_re = re.sub(regex_username, '', txt_re)\n",
	"\n",
	" return [x.lower() for x in word_tokenize(txt_re)]\n",
	"\n",
	"\n",
	"def remove_punctuation(tokenized_docs):\n",
	" regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
	"\n",
	" tokenized_docs_no_punctuation = []\n",
	"\n",
	" for token in tokenized_docs:\n",
	" new_token = regex.sub(u'', token)\n",
	" if not new_token == u'':\n",
	" tokenized_docs_no_punctuation.append(new_token)\n",
	"\n",
	" return tokenized_docs_no_punctuation\n",
	"\n",
	"\n",
	"def remove_stopwords(tokenized_docs_no_punctuation):\n",
	" from nltk.corpus import stopwords\n",
	" tokenized_docs_no_stopwords = []\n",
	"\n",
	" for word in tokenized_docs_no_punctuation:\n",
	" if not word in stopwords.words('english'):\n",
	" tokenized_docs_no_stopwords.append(word)\n",
	"\n",
	" return tokenized_docs_no_stopwords"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Analysis"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"1- Read all the files from the bucket\n",
	"<br>\n",
	"2- Load the JSON content and grab the text field\n",
	"<br>\n",
	"3- Tokenize the text and do some cleanup (remove URLs, unicode characters, ...)\n",
	"<br>\n",
	"4- Count the number of times each word appeared across all files\n",
	"<br>\n",
	"5- Show a bar chart of word frequency (only for words appearing more four or more times)"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	" # s3 bucket\n",
	" c = boto.connect_s3(aws_key, aws_secret)\n",
	" b = c.get_bucket('amirziai-mids-w205-assignment2')\n",
	"\n",
	" # iterate over all bucket keys (files)\n",
	" word_dic = {}\n",
	" for k in b.list():\n",
	" key = Key(b)\n",
	" for tweet in json.loads(k.get_contents_as_string()):\n",
	" # tokenize and clean up\n",
	" words = remove_stopwords(remove_punctuation(remove_urls_usernames(tweet['text'])))\n",
	" \n",
	" # word count\n",
	" for w in words:\n",
	" if w in word_dic:\n",
	" word_dic[w] += 1\n",
	" else:\n",
	" word_dic[w] = 1\n",
	"\n",
	" # print word_dic\n",
	" show_higher_than = 10 # just making sure there are not too many words on the plot\n",
	" word_dic_filtered = {k: v for k, v in word_dic.iteritems() if v > show_higher_than}\n",
	" X = np.arange(len(word_dic_filtered))\n",
	" pl.bar(X, word_dic_filtered.values(), align='center', width=0.5)\n",
	" pl.xticks(X, word_dic_filtered.keys())\n",
	" ymax = max(word_dic_filtered.values()) + 1\n",
	" pl.ylim(0, ymax)\n",
	" pl.show()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 4
	}
	],
	"metadata": {}
	}
	]
	}