@amirziai
Created February 15, 2015 03:06
{
"metadata": {
"name": "",
"signature": "sha256:be11b254572e53a3734ec12d696e0e322aac08909693b8bec4dd82eeefb13a20"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"import re\n",
"import string\n",
"from nltk import word_tokenize"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 174
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class TweetSerializer:\n",
" \n",
" def __init__(self):\n",
" self.out = None\n",
" self.first = True\n",
" self.count = 0\n",
" \n",
" def start(self):\n",
" self.count += 1\n",
" fname = \"tweets-\"+str(self.count)+\".json\"\n",
" self.out = open(fname,\"w\")\n",
" self.out.write(\"[\\n\")\n",
" self.first = True\n",
"\n",
" def end(self):\n",
" if self.out is not None:\n",
" self.out.write(\"\\n]\\n\")\n",
" self.out.close()\n",
" self.out = None\n",
"\n",
" def write(self,tweet):\n",
" if not self.first:\n",
" self.out.write(\",\\n\")\n",
" self.first = False\n",
" self.out.write(json.dumps(tweet).encode('utf8'))\n",
" #self.out.write(json.dumps(tweet._json).encode('utf8'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = TweetSerializer()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x.start()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x.write('this is a sample tweet')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 39
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Twitter"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import sys\n",
"import datetime\n",
"import urllib\n",
"import signal\n",
"import json"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import tweepy\n",
"consumer_key = \"BmXCpNVAs3K7fALROjRqLCZIx\";\n",
"consumer_secret = \"jkSgmklrMlBhtDap3cw7s4Bo1t67irWpkbLeNvYIIxcUr5mNIp\";\n",
"access_token = \"1654270760-c3vmF3tTobV5tSpTNnEBKZhCJlimUMhZmTJtOih\";\n",
"access_token_secret = \"OUUe7LpjKVRdi8NiwKaqmg3pWPIjt7Xxet3pM4sVsuLhK\";\n",
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
"auth.set_access_token(access_token, access_token_secret)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"api = tweepy.API(auth_handler=auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)\n",
"\n",
"N = 3\n",
"q_item = '#microsoft #mojang'\n",
"q = urllib.quote_plus(q_item) # URL encoded query\n",
"q+= 'since:2014-01-01 until:2015-02-12'\n",
"\n",
"# Additional query parameters:\n",
"# since: {date}\n",
"# until: {date}\n",
"# Just add them to the 'q' variable: q+\" since: 2014-01-01 until: 2014-01-02\"\n",
"\n",
"\n",
"# james's:\n",
"q = '#microsoft AND #mojang since:2015-02-08'\n",
"\n",
"# katherine's\n",
"tweets = tweepy.Cursor(api.search,q=urllib.quote_plus('#microsoft #mojang'),start='2015-02-08').items(N)\n",
"\n",
"#tweets = tweepy.Cursor(api.search,q=q).items(N)\n",
"\n",
"serializer = TweetSerializer()\n",
"serializer.start()\n",
"\n",
"for tweet in tweets:\n",
" # FYI: JSON is in tweet._json\n",
" #print tweet._json\n",
" print tweet.text, '\\n'\n",
" serializer.write(tweet._json)\n",
" \n",
"serializer.end()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Open JSON and extract words"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with open('tweets-1.json') as f:\n",
" data = json.load(f)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"import re\n",
"regex_url = r'(http|https)://\\S+'\n",
"\n",
"dic = {}\n",
"\n",
"for d in data:\n",
" tokens = word_tokenize(d['text'])\n",
" print tokens, '\\n'\n",
" for token in tokens:\n",
" if token in dic:\n",
" dic[token] += 1\n",
" else:\n",
" dic[token] = 1\n",
" #print i['text'], '\\n'"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[u'Hey', u'followers', u'\\u2764\\ufe0f', u'join', u'the', u'club', u'\\U0001f618', u'\\u2606', u'#', u'selfie', u'#', u'minecraft', u'#', u'chocker', u'#', u'pennys', u'#', u'bbw', u'#', u'mojang', u'#', u'microsoft', u'#', u'naruto', u'#', u'pocketbacs\\u2026', u'http', u':', u'//t.co/QZWL50K4Ff'] \n",
"\n",
"[u'http', u':', u'//t.co/ZzSEK3UhQL', u'For', u'sale', u'https', u':', u'//t.co/yS52UKlYqX', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n",
"\n",
"[u'http', u':', u'//t.co/YFo8wxhmGt', u'For', u'sale', u'https', u':', u'//t.co/WPmD0AqH6W', u'#', u'Apple', u'#', u'Startup', u'#', u'Vegas', u'#', u'bigdata', u'#', u'Microsoft', u'#', u'google', u'#', u'investor', u'#', u'UK', u'#', u'Amazon', u'#', u'Mojang'] \n",
"\n"
]
}
],
"prompt_number": 60
},
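{
"cell_type": "code",
"collapsed": false,
"input": [
"# A minimal sketch, assuming the 'dic' counts built in the cell above:\n",
"# print the ten most frequent tokens.\n",
"for token, count in sorted(dic.items(), key=lambda kv: kv[1], reverse=True)[:10]:\n",
"    print token, count"
],
"language": "python",
"metadata": {},
"outputs": []
},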
{
"cell_type": "code",
"collapsed": false,
"input": [
"txt = data[0]['text']\n",
"#remove_urls_usernames(txt)\n",
"\n",
"yy = remove_stopwords(remove_punctuation(remove_urls_usernames(txt)))\n",
"print yy"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['hey', 'followers', 'join', 'cl', 'selfie', 'minecraft', 'chocker', 'pennys', 'bbw', 'mojang', 'microsoft', 'nar', 'pocketbacs']\n"
]
}
],
"prompt_number": 173
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Text processing procedure"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def remove_urls_usernames(txt):\n",
" # escape unicode characters\n",
" txt = txt.encode('utf-8').decode('unicode_escape').encode('ascii','ignore')\n",
" \n",
" regex_url = r'(http|https)://\\S+'\n",
" regex_username = r'@\\S+'\n",
"\n",
" txt_re = re.sub(regex_url, '', txt)\n",
" txt_re = re.sub(regex_unicode, '', txt_re)\n",
" txt_re = re.sub(regex_username, '', txt_re)\n",
" \n",
" return [x.lower() for x in word_tokenize(txt_re)]\n",
" #return filter(lambda x:x!='#', xxx)\n",
"\n",
"def remove_punctuation(tokenized_docs):\n",
" regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
"\n",
" tokenized_docs_no_punctuation = []\n",
"\n",
" for token in tokenized_docs: \n",
" new_token = regex.sub(u'', token)\n",
" if not new_token == u'':\n",
" tokenized_docs_no_punctuation.append(new_token)\n",
" \n",
" return tokenized_docs_no_punctuation\n",
"\n",
"def remove_stopwords(tokenized_docs_no_punctuation):\n",
" from nltk.corpus import stopwords\n",
" tokenized_docs_no_stopwords = []\n",
"\n",
" for word in tokenized_docs_no_punctuation:\n",
" if not word in stopwords.words('english'):\n",
" tokenized_docs_no_stopwords.append(word)\n",
" \n",
" return tokenized_docs_no_stopwords"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 171
},
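{
"cell_type": "code",
"collapsed": false,
"input": [
"# A minimal sketch, assuming 'data' (loaded from tweets-1.json) and the\n",
"# cleaning functions above are already defined: run the full pipeline over\n",
"# every tweet and count word frequencies.\n",
"from collections import Counter\n",
"\n",
"word_counts = Counter()\n",
"for d in data:\n",
"    tokens = remove_stopwords(remove_punctuation(remove_urls_usernames(d['text'])))\n",
"    word_counts.update(tokens)\n",
"\n",
"print word_counts.most_common(10)"
],
"language": "python",
"metadata": {},
"outputs": []
},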
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}