Created
February 15, 2015 03:06
-
-
Save amirziai/c117f491f5070558c8bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:be11b254572e53a3734ec12d696e0e322aac08909693b8bec4dd82eeefb13a20" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import json\n", | |
"import re\n", | |
"import string\n", | |
"from nltk import word_tokenize" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 174 | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "class TweetSerializer:\n",
 "    \"\"\"Serialize tweets into numbered JSON-array files (tweets-1.json, ...).\n",
 "\n",
 "    Usage: start() opens the next file, write(tweet) appends one tweet,\n",
 "    end() closes the JSON array and the file.\n",
 "    \"\"\"\n",
 "\n",
 "    def __init__(self):\n",
 "        self.out = None    # currently open file handle, or None\n",
 "        self.first = True  # True until first tweet written (comma placement)\n",
 "        self.count = 0     # number of files opened so far\n",
 "\n",
 "    def start(self):\n",
 "        \"\"\"Open the next tweets-<n>.json file and write the opening '['.\"\"\"\n",
 "        self.count += 1\n",
 "        fname = \"tweets-\" + str(self.count) + \".json\"\n",
 "        self.out = open(fname, \"w\")\n",
 "        self.out.write(\"[\\n\")\n",
 "        self.first = True\n",
 "\n",
 "    def end(self):\n",
 "        \"\"\"Write the closing ']' and close the file; safe to call twice.\"\"\"\n",
 "        if self.out is not None:\n",
 "            self.out.write(\"\\n]\\n\")\n",
 "            self.out.close()\n",
 "            self.out = None\n",
 "\n",
 "    def write(self, tweet):\n",
 "        \"\"\"Append one JSON-serializable tweet to the currently open file.\n",
 "\n",
 "        Raises ValueError if no file is open (start() not called, or\n",
 "        end() already called) instead of an opaque AttributeError.\n",
 "        \"\"\"\n",
 "        if self.out is None:\n",
 "            raise ValueError(\"no open file: call start() before write()\")\n",
 "        if not self.first:\n",
 "            self.out.write(\",\\n\")\n",
 "        self.first = False\n",
 "        self.out.write(json.dumps(tweet).encode('utf8'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Smoke-test the serializer on a plain string before using real tweets.\n",
 "x = TweetSerializer()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Opens tweets-1.json and writes the opening '['.\n",
 "x.start()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "x.write('this is a sample tweet')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Writes the closing ']' and closes the file.\n",
 "x.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 39
},
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Twitter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import sys\n", | |
"import datetime\n", | |
"import urllib\n", | |
"import signal\n", | |
"import json" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import tweepy\n", | |
"consumer_key = \"BmXCpNVAs3K7fALROjRqLCZIx\";\n", | |
"consumer_secret = \"jkSgmklrMlBhtDap3cw7s4Bo1t67irWpkbLeNvYIIxcUr5mNIp\";\n", | |
"access_token = \"1654270760-c3vmF3tTobV5tSpTNnEBKZhCJlimUMhZmTJtOih\";\n", | |
"access_token_secret = \"OUUe7LpjKVRdi8NiwKaqmg3pWPIjt7Xxet3pM4sVsuLhK\";\n", | |
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", | |
"auth.set_access_token(access_token, access_token_secret)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)\n",
 "\n",
 "N = 3  # number of tweets to fetch\n",
 "\n",
 "# Twitter search operators (since:/until:) are part of the query string\n",
 "# itself. NOTE(review): the previous revision appended 'since:...' to an\n",
 "# already URL-encoded query with no separating space (corrupting it),\n",
 "# built two more q values that were immediately discarded, and passed an\n",
 "# unsupported 'start=' keyword to api.search.\n",
 "q = urllib.quote_plus('#microsoft #mojang since:2015-02-08')\n",
 "tweets = tweepy.Cursor(api.search, q=q).items(N)\n",
 "\n",
 "serializer = TweetSerializer()\n",
 "serializer.start()\n",
 "\n",
 "for tweet in tweets:\n",
 "    # Full API payload is in tweet._json; tweet.text is the display text.\n",
 "    print tweet.text, '\\n'\n",
 "    serializer.write(tweet._json)\n",
 "\n",
 "serializer.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Open JSON and extract words" | |
] | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "# Read the first serialized batch back in as a list of tweet dicts.\n",
 "with open('tweets-1.json') as fh:\n",
 "    data = json.load(fh)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"\n", | |
"import re\n", | |
"regex_url = r'(http|https)://\\S+'\n", | |
"\n", | |
"dic = {}\n", | |
"\n", | |
"for d in data:\n", | |
" tokens = word_tokenize(d['text'])\n", | |
" print tokens, '\\n'\n", | |
" for token in tokens:\n", | |
" if token in dic:\n", | |
" dic[token] += 1\n", | |
" else:\n", | |
" dic[token] = 1\n", | |
" #print i['text'], '\\n'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Hey', u'followers', u'\\u2764\\ufe0f', u'join', u'the', u'club', u'\\U0001f618', u'\\u2606', u'#', u'selfie', u'#', u'minecraft', u'#', u'chocker', u'#', u'pennys', u'#', u'bbw', u'#', u'mojang', u'#', u'microsoft', u'#', u'naruto', u'#', u'pocketbacs\\u2026', u'http', u':', u'//t.co/QZWL50K4Ff'] \n", | |
"\n", | |
"[u'http', u':', u'//t.co/ZzSEK3UhQL', u'For', u'sale', u'https', u':', u'//t.co/yS52UKlYqX', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n", | |
"\n", | |
"[u'http', u':', u'//t.co/YFo8wxhmGt', u'For', u'sale', u'https', u':', u'//t.co/WPmD0AqH6W', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"txt = data[0]['text']\n", | |
"#remove_urls_usernames(txt)\n", | |
"\n", | |
"yy = remove_stopwords(remove_punctuation(remove_urls_usernames(txt)))\n", | |
"print yy" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['hey', 'followers', 'join', 'cl', 'selfie', 'minecraft', 'chocker', 'pennys', 'bbw', 'mojang', 'microsoft', 'nar', 'pocketbacs']\n" | |
] | |
} | |
], | |
"prompt_number": 173 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Text processing procedure" | |
] | |
}, | |
{
"cell_type": "code",
"collapsed": false,
"input": [
 "def remove_urls_usernames(txt):\n",
 "    \"\"\"Strip non-ASCII chars, URLs and @usernames; return lowercased tokens.\"\"\"\n",
 "    # Drop everything outside ASCII first (emoji etc.) so the regexes\n",
 "    # below operate on plain text.\n",
 "    txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore')\n",
 "\n",
 "    regex_url = r'(http|https)://\\S+'\n",
 "    regex_username = r'@\\S+'\n",
 "\n",
 "    # BUG FIX: the previous revision also substituted an undefined name\n",
 "    # (regex_unicode), which raises NameError on a fresh kernel; the\n",
 "    # ASCII re-encode above already removes those characters.\n",
 "    txt_re = re.sub(regex_url, '', txt)\n",
 "    txt_re = re.sub(regex_username, '', txt_re)\n",
 "\n",
 "    return [x.lower() for x in word_tokenize(txt_re)]\n",
 "\n",
 "def remove_punctuation(tokenized_docs):\n",
 "    \"\"\"Strip punctuation chars from each token; drop tokens left empty.\"\"\"\n",
 "    # string.punctuation: http://docs.python.org/2/library/string.html\n",
 "    regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
 "\n",
 "    tokenized_docs_no_punctuation = []\n",
 "    for token in tokenized_docs:\n",
 "        new_token = regex.sub(u'', token)\n",
 "        if new_token != u'':\n",
 "            tokenized_docs_no_punctuation.append(new_token)\n",
 "\n",
 "    return tokenized_docs_no_punctuation\n",
 "\n",
 "def remove_stopwords(tokenized_docs_no_punctuation):\n",
 "    \"\"\"Drop English stopwords from a token list.\"\"\"\n",
 "    from nltk.corpus import stopwords\n",
 "    # Build the stopword set once: calling stopwords.words('english') per\n",
 "    # token re-reads the corpus and makes each membership test O(n).\n",
 "    stopword_set = set(stopwords.words('english'))\n",
 "    return [word for word in tokenized_docs_no_punctuation if word not in stopword_set]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 171
},
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment