@amirziai
Created February 15, 2015 07:53
{
"metadata": {
"name": "",
"signature": "sha256:a00d2f7efeedaf8c3cba9f566d27250aa64c6cd2676cbf07d7c7b6fd0a293673"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"import string\n",
"import re\n",
"from nltk import word_tokenize\n",
"import boto\n",
"from boto.s3.key import Key\n",
"import pylab as pl\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
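{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `word_tokenize` and the English stopword list used below depend on NLTK corpora. If they are not already installed, the one-time download in the next cell (a sketch, assuming network access) fetches them."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import nltk\n",
"# one-time download of the tokenizer model and stopword corpus (assumes network access)\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
],
"language": "python",
"metadata": {},
"outputs": []
},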
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"AWS Connection Credentials"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"aws_key = ''\n",
"aws_secret = ''"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
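{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rather than pasting the keys into the notebook, they could be read from the environment; the next cell is a minimal sketch that assumes the standard AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY variables are set (boto can also pick these up on its own)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"\n",
"# sketch: load credentials from environment variables instead of hard-coding them\n",
"aws_key = os.environ.get('AWS_ACCESS_KEY_ID', aws_key)\n",
"aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret)"
],
"language": "python",
"metadata": {},
"outputs": []
},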
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Text cleanup functions"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def remove_urls_usernames(txt):\n",
" # escape unicode characters\n",
" txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n",
"\n",
" regex_url = r'(http|https)://\\S+'\n",
" regex_username = r'@\\S+'\n",
" txt_re = re.sub(regex_url, '', txt)\n",
" txt_re = re.sub(regex_username, '', txt_re)\n",
"\n",
" return [x.lower() for x in word_tokenize(txt_re)]\n",
"\n",
"\n",
"def remove_punctuation(tokenized_docs):\n",
" regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
"\n",
" tokenized_docs_no_punctuation = []\n",
"\n",
" for token in tokenized_docs:\n",
" new_token = regex.sub(u'', token)\n",
" if not new_token == u'':\n",
" tokenized_docs_no_punctuation.append(new_token)\n",
"\n",
" return tokenized_docs_no_punctuation\n",
"\n",
"\n",
"def remove_stopwords(tokenized_docs_no_punctuation):\n",
" from nltk.corpus import stopwords\n",
" tokenized_docs_no_stopwords = []\n",
"\n",
" for word in tokenized_docs_no_punctuation:\n",
" if not word in stopwords.words('english'):\n",
" tokenized_docs_no_stopwords.append(word)\n",
"\n",
" return tokenized_docs_no_stopwords"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
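{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the cleanup pipeline on a made-up tweet (the string below is illustrative only, not data from the bucket):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# illustrative example: the URL, username, punctuation and stopwords should all be stripped\n",
"sample_tweet = u'Check this out http://t.co/abc123 via @someone ... data science is FUN!!!'\n",
"print remove_stopwords(remove_punctuation(remove_urls_usernames(sample_tweet)))"
],
"language": "python",
"metadata": {},
"outputs": []
},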
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1- Read all the files from the bucket\n",
"<br>\n",
"2- Load the JSON content and grab the text field\n",
"<br>\n",
"3- Tokenize the text and do some cleanup (remove URLs, unicode characters, ...)\n",
"<br>\n",
"4- Count the number of times each word appeared across all files\n",
"<br>\n",
"5- Show a bar chart of word frequency (only for words appearing more four or more times)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
" # s3 bucket\n",
" c = boto.connect_s3(aws_key, aws_secret)\n",
" b = c.get_bucket('amirziai-mids-w205-assignment2')\n",
"\n",
" # iterate over all bucket keys (files)\n",
" word_dic = {}\n",
" for k in b.list():\n",
" key = Key(b)\n",
" for tweet in json.loads(k.get_contents_as_string()):\n",
" # tokenize and clean up\n",
" words = remove_stopwords(remove_punctuation(remove_urls_usernames(tweet['text'])))\n",
" \n",
" # word count\n",
" for w in words:\n",
" if w in word_dic:\n",
" word_dic[w] += 1\n",
" else:\n",
" word_dic[w] = 1\n",
"\n",
" # print word_dic\n",
" show_higher_than = 10 # just making sure there are not too many words on the plot\n",
" word_dic_filtered = {k: v for k, v in word_dic.iteritems() if v > show_higher_than}\n",
" X = np.arange(len(word_dic_filtered))\n",
" pl.bar(X, word_dic_filtered.values(), align='center', width=0.5)\n",
" pl.xticks(X, word_dic_filtered.keys())\n",
" ymax = max(word_dic_filtered.values()) + 1\n",
" pl.ylim(0, ymax)\n",
" pl.show()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
}
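,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a design note, the hand-rolled word count above could equivalently be built with `collections.Counter` from the standard library; the cell below is a sketch of that alternative (same bucket and cleanup functions, not part of the original assignment code)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"\n",
"# sketch: equivalent word count using Counter instead of a manual dictionary\n",
"word_counts = Counter()\n",
"for k in b.list():\n",
"    for tweet in json.loads(k.get_contents_as_string()):\n",
"        word_counts.update(remove_stopwords(remove_punctuation(remove_urls_usernames(tweet['text']))))\n",
"\n",
"# the ten most frequent words and their counts\n",
"print word_counts.most_common(10)"
],
"language": "python",
"metadata": {},
"outputs": []
}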
],
"metadata": {}
}
]
}