Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Collect tweets about SMRT Stations
#!/usr/bin/python
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import re
import sys
import json
import dateutil.parser
from pytz import timezone
import pytz
# Twitter API credentials (placeholders - fill in your own values).
#
# Consumer key/secret: listed on the application's Details page at
# https://dev.twitter.com/apps, in the "OAuth settings" section.
CONSUMER_KEY = 'XXXXXXXXXXXXXXXXXXXXXX'
CONSUMER_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Access token/secret: listed on the same Details page, in the
# "Your access token" section.
ACCESS_TOKEN = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
ACCESS_TOKEN_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Time zone objects: UTC and Singapore local time.
utc = pytz.timezone('UTC')
sgtz = pytz.timezone('Asia/Singapore')
# Search phrases, one per MRT station, each in the form "<station name> MRT".
# The order is kept exactly as originally written.
STATIONS = [
    'Admiralty MRT',
    'Aljunied MRT',
    'Ang Mo Kio MRT',
    'Bartley MRT',
    'Bayfront MRT',
    'Bedok MRT',
    'Bishan MRT',
    'Bras Basah MRT',
    'Botanic Gardens MRT',
    'Braddell MRT',
    'Bukit Batok MRT',
    'Bukit Gombak MRT',
    'Caldecott MRT',
    'Choa Chu Kang MRT',
    'Boon Keng MRT',
    'Boon Lay MRT',
    'Buangkok MRT',
    'Bugis MRT',
    'Buona Vista MRT',
    'Changi Airport MRT',
    'Chinatown MRT',
    'Clarke Quay MRT',
    'Chinese Garden MRT',
    'City Hall MRT',
    'Clementi MRT',
    'Commonwealth MRT',
    'Dakota MRT',
    'Dhoby Ghaut MRT',
    'Dover MRT',
    'Esplanade MRT',
    'Eunos MRT',
    'Expo MRT',
    'Farrer Park MRT',
    'Farrer Road MRT',
    'HarbourFront MRT',
    'Haw Par Villa MRT',
    'Holland Village MRT',
    'Hougang MRT',
    'Joo Koon MRT',
    'Jurong East MRT',
    'Kallang MRT',
    'Kovan MRT',
    'Kembangan MRT',
    'Kent Ridge MRT',
    'Khatib MRT',
    'Kranji MRT',
    'Lakeside MRT',
    'Labrador Park MRT',
    'Lavender MRT',
    'Little India MRT',
    'Lorong Chuan MRT',
    'Marina Bay MRT',
    'Marsiling MRT',
    'MacPherson MRT',
    'Marymount MRT',
    'Mountbatten MRT',
    'Newton MRT',
    'Nicoll Highway MRT',
    'one-north MRT',
    'Novena MRT',
    'Orchard MRT',
    'Outram Park MRT',
    'Pasir Ris MRT',
    'Pasir Panjang MRT',
    'Paya Lebar MRT',
    'Pioneer MRT',
    'Potong Pasir MRT',
    'Promenade MRT',
    'Punggol MRT',
    'Queenstown MRT',
    'Raffles Place MRT',
    'Redhill MRT',
    'Sembawang MRT',
    'Sengkang MRT',
    'Serangoon MRT',
    'Simei MRT',
    'Somerset MRT',
    'Stadium MRT',
    'Tampines MRT',
    'Tai Seng MRT',
    'Tanah Merah MRT',
    'Tanjong Pagar MRT',
    'Tiong Bahru MRT',
    'Telok Blangah MRT',
    'Toa Payoh MRT',
    'Woodlands MRT',
    'Woodleigh MRT',
    'Yew Tree MRT',
    'Yio Chu Kang MRT',
    'Yishun MRT',
]
# Alternation of every station phrase, lower-cased; callers must lower-case
# the text they search.  Each name is passed through re.escape so that a
# station name containing a regex metacharacter is still matched literally.
regex = re.compile('|'.join(re.escape(name.lower()) for name in STATIONS))

# Two capital letters followed by digits (e.g. "NS1", "EW24").
linenum_re = re.compile(r'([A-Z][A-Z]\d+)')

# Text that starts with "RT" followed by whitespace.
retweets_re = re.compile(r'^RT\s')


def enc(x):
    """Encode text *x* as latin-1, silently dropping unencodable characters.

    ``None`` is treated as an empty string so that fields that arrive as
    JSON null can be encoded without raising ``AttributeError``.
    """
    if x is None:
        x = u''
    return x.encode('latin1', errors='ignore')
class StdOutListener(StreamListener):
    """Stream listener that archives tweets mentioning an MRT station.

    For every matching, non-retweet message the raw JSON is appended to an
    hourly file, a station location row is recorded when the tweet carries
    point coordinates and a line code, and a summary is printed to stdout.
    """

    def on_data(self, data):
        """Process one raw message received from the stream.

        Args:
            data: raw JSON text of the message.

        Returns:
            True in every case, whether the message was stored or ignored.
        """
        try:
            tweet = json.loads(data)
        except ValueError:
            # One undecodable payload should not take the collector down.
            print('Undecodable stream data - ignoring.')
            return True

        if 'user' not in tweet:
            print('No user data - ignoring tweet.')
            return True

        user_info = tweet['user']
        user = enc(user_info.get('name') or u'')
        text = enc(tweet.get('text') or u'')

        # ignore text that doesn't contain one of the keywords
        # (the pattern is lower-case, so the text is lowered before matching)
        matches = regex.search(text.lower())
        if not matches:
            return True

        # ignore retweets
        if retweets_re.search(text):
            return True

        # These fields may be null or missing; fall back to an empty string
        # instead of crashing on None.encode(...).
        location = enc(user_info.get('location') or u'')
        source = enc(tweet.get('source') or u'')

        # convert the creation time to Singapore local time
        created = dateutil.parser.parse(enc(tweet['created_at']))
        localtime = created.astimezone(sgtz)
        tmstr = localtime.strftime("%Y%m%d-%H:%M:%S")

        # append the hourly tweet file; everything before the first ':' is
        # "YYYYMMDD-HH", so all tweets of one local hour share a file
        with open('tweets-%s.data' % tmstr.split(':')[0], 'a+') as f:
            f.write(data)

        # is this a geocoded tweet?  ('geo' may be absent or null)
        geo = tweet.get('geo')
        if geo and geo.get('type') == 'Point':
            # collect location of mrt station
            coords = geo['coordinates']
            ln = linenum_re.search(text)
            if ln:
                # NOTE(review): Twitter documents the legacy `geo` field as
                # [latitude, longitude], so coords[1] would be the longitude
                # and is written first - confirm the intended column order.
                print("Found geo coords for MRT Station (%s) '%s': (%f, %f)\n" %
                      (ln.group(), matches.group(), coords[1], coords[0]))
                with open('mrt_station_locations.csv', 'a+') as mrtgeo:
                    mrtgeo.write("%f\t%f\t%s\t%s\n" %
                                 (coords[1], coords[0], matches.group(), ln.group()))

        # print summary of tweet
        print('%s\n%s\n%s\n%s\n%s\n\n ----------------\n' % (user, location, source, tmstr, text))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print('status: %s' % status)
if __name__ == '__main__':
    # Build the OAuth credentials from the module-level keys and tokens.
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    # Open the stream with our listener and track every station phrase.
    listener = StdOutListener()
    stream = Stream(auth, listener, timeout=60)
    print("Listening to filter stream...")
    stream.filter(track=STATIONS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment