Created
February 15, 2015 07:53
-
-
Save amirziai/0fade9e28a1bbbf47e61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:a00d2f7efeedaf8c3cba9f566d27250aa64c6cd2676cbf07d7c7b6fd0a293673" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# standard library\n", | |
"import json\n", | |
"import re\n", | |
"import string\n", | |
"\n", | |
"# third-party\n", | |
"import boto\n", | |
"import numpy as np\n", | |
"import pylab as pl\n", | |
"from boto.s3.key import Key\n", | |
"from nltk import word_tokenize" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"AWS Connection Credentials" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import os\n", | |
"\n", | |
"# Read AWS credentials from the environment instead of hardcoding them:\n", | |
"# keys pasted into a notebook end up in version control / public gists.\n", | |
"# Defaults to '' (the original behavior) when the variables are unset.\n", | |
"aws_key = os.environ.get('AWS_ACCESS_KEY_ID', '')\n", | |
"aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY', '')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Text cleanup functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def remove_urls_usernames(txt):\n", | |
"    \"\"\"Strip URLs and @usernames from a tweet, then return lowercased tokens.\"\"\"\n", | |
"    # escape unicode characters so the tokenizer only sees ascii text\n", | |
"    txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n", | |
"\n", | |
"    regex_url = r'(http|https)://\\S+'\n", | |
"    regex_username = r'@\\S+'\n", | |
"    txt_re = re.sub(regex_url, '', txt)\n", | |
"    txt_re = re.sub(regex_username, '', txt_re)\n", | |
"\n", | |
"    return [x.lower() for x in word_tokenize(txt_re)]\n", | |
"\n", | |
"\n", | |
"def remove_punctuation(tokenized_docs):\n", | |
"    \"\"\"Delete punctuation characters from each token; drop tokens left empty.\"\"\"\n", | |
"    regex = re.compile('[%s]' % re.escape(string.punctuation))\n", | |
"    stripped = (regex.sub(u'', token) for token in tokenized_docs)\n", | |
"    return [token for token in stripped if token != u'']\n", | |
"\n", | |
"\n", | |
"def remove_stopwords(tokenized_docs_no_punctuation):\n", | |
"    \"\"\"Drop English stopwords from a list of tokens.\"\"\"\n", | |
"    from nltk.corpus import stopwords\n", | |
"    # build the stopword set once (O(1) membership tests) instead of calling\n", | |
"    # stopwords.words('english') -- a corpus read -- for every single token\n", | |
"    stopword_set = set(stopwords.words('english'))\n", | |
"    return [word for word in tokenized_docs_no_punctuation if word not in stopword_set]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Analysis" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1- Read all the files from the bucket\n", | |
"<br>\n", | |
"2- Load the JSON content and grab the text field\n", | |
"<br>\n", | |
"3- Tokenize the text and do some cleanup (remove URLs, unicode characters, ...)\n", | |
"<br>\n", | |
"4- Count the number of times each word appeared across all files\n", | |
"<br>\n", | |
"5- Show a bar chart of word frequency (only for words appearing more than ten times)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# NOTE: the original cell had every line indented one level at module scope,\n", | |
"# which raises IndentationError on execution; indentation removed.\n", | |
"\n", | |
"# s3 bucket\n", | |
"c = boto.connect_s3(aws_key, aws_secret)\n", | |
"b = c.get_bucket('amirziai-mids-w205-assignment2')\n", | |
"\n", | |
"# iterate over all bucket keys (files) and count word occurrences\n", | |
"word_dic = {}\n", | |
"for k in b.list():\n", | |
"    for tweet in json.loads(k.get_contents_as_string()):\n", | |
"        # tokenize and clean up (strip URLs/usernames, punctuation, stopwords)\n", | |
"        words = remove_stopwords(remove_punctuation(remove_urls_usernames(tweet['text'])))\n", | |
"\n", | |
"        # word count\n", | |
"        for w in words:\n", | |
"            word_dic[w] = word_dic.get(w, 0) + 1\n", | |
"\n", | |
"show_higher_than = 10  # just making sure there are not too many words on the plot\n", | |
"word_dic_filtered = {k: v for k, v in word_dic.iteritems() if v > show_higher_than}\n", | |
"X = np.arange(len(word_dic_filtered))\n", | |
"pl.bar(X, word_dic_filtered.values(), align='center', width=0.5)\n", | |
"pl.xticks(X, word_dic_filtered.keys())\n", | |
"ymax = max(word_dic_filtered.values()) + 1\n", | |
"pl.ylim(0, ymax)\n", | |
"pl.show()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment