Skip to content

Instantly share code, notes, and snippets.

@danielsamuels
Last active August 29, 2015 14:01
Show Gist options
  • Save danielsamuels/cd5945991ce3af0d7216 to your computer and use it in GitHub Desktop.
Save danielsamuels/cd5945991ce3af0d7216 to your computer and use it in GitHub Desktop.
BBC Twitter analysis
"""
A Python script that pulls all tweets about the 2014 European elections from
the main BBC Twitter accounts. The aim is to analyse the data to see which
political party was tweeted about the most.

Data collection:
    Per account, without retweets, where the content matches a keyword.
    Per account, with retweets, where the content matches a keyword.

Data format:
    Account | Tweet ID | Date | Content | Mentions UKIP | Mentions Conservative | Mentions Libdem | Mentions Labour | Mentions Green
"""
import csv
import datetime
import re
import sys
import time

from twython import Twython, TwythonRateLimitError
class DataCollector(object):
    """Collect election-related tweets from BBC accounts and export them as CSV.

    Creating an instance runs the whole job: every account in
    ``bbc_accounts`` is paged backwards until a page ends on or before
    ``start_date``, each tweet is flagged per party using ``keywords``, and
    the tweets that mention at least one party are written to a CSV file.

    The code runs on both Python 2 and Python 3.
    """

    # Paging stops once the oldest tweet of a page is on or before this date.
    start_date = datetime.date(2014, 3, 1)

    bbc_accounts = [
        'bbcnews',
        'bbcbreaking',
        'BBCPolitics',
        'bbcworld',
        'bbcengland',
    ]

    # Party -> keywords. _party_mentioned() matches each keyword as a whole
    # word, case-insensitively, with an optional trailing "s", so a singular
    # keyword also covers its plural ("conservative" matches "Conservatives").
    keywords = {
        'ukip': ['ukip', 'farage'],
        'conservative': ['conservative', 'cameron', 'tory', 'tories'],
        'libdem': ['liberal', 'democrats', 'lib dem', 'clegg'],
        'labour': ['labour', 'miliband'],
        'green': ['green', 'bennett'],
    }

    # Order of the per-party Yes/No columns in the CSV output.
    _parties = ('ukip', 'conservative', 'libdem', 'labour', 'green')

    # Layout of the ``created_at`` field as parsed by this script.
    _created_at_format = '%a %b %d %H:%M:%S +0000 %Y'

    def __init__(self, include_rts=True):
        """Fetch every account's timeline and write the CSV.

        Args:
            include_rts: When true (the default) retweets are included and
                the output file is ``rts.csv``; when false they are left out
                and the output file is ``no_rts.csv``.
        """
        # Per-instance state; a class-level dict would be shared by every
        # instance.
        self.tweets = {}
        self.last_id = None

        # APP_KEY, APP_SECRET, OAUTH_KEY and OAUTH_SECRET are not defined in
        # the visible source; they must exist at module level before the
        # class is instantiated.
        self.twitter = Twython(APP_KEY, APP_SECRET, OAUTH_KEY, OAUTH_SECRET)

        for account in self.bbc_accounts:
            self._get_timeline(account, include_rts)

        self._write_csv('rts' if include_rts else 'no_rts')
        print('All done!')

    def _get_timeline(self, account, include_rts, max_id=None):
        """Page backwards through one account's timeline, storing every tweet.

        Args:
            account: Screen name of the account to fetch.
            include_rts: Whether retweets are requested.
            max_id: Optional tweet ID (string) to start paging from.

        Returns:
            True once a page ends on or before ``start_date``; False when
            ``_process_timeline`` reports nothing further to fetch.

        Raises:
            SystemExit: If Twitter reports that the rate limit was hit.
        """
        while True:
            print('Getting timeline for {} with {}max ID{} and {}retweets'.format(
                account,
                '' if max_id else 'no ',
                ' ' + max_id if max_id else '',
                'no ' if include_rts is False else ''
            ))

            params = {
                'screen_name': account,
                'trim_user': True,
                'exclude_replies': True,
                'include_rts': include_rts,
                'count': 200,
            }
            if max_id:
                params['max_id'] = max_id

            try:
                timeline = self.twitter.get_user_timeline(**params)
            except TwythonRateLimitError:
                # Only a genuine rate-limit response is handled here; any
                # other failure propagates with its own traceback instead of
                # being misreported as a rate limit.
                reset_header = self.twitter.get_lastfunction_header('X-Rate-Limit-Reset')
                wait = max(float(reset_header) - time.time(), 0) if reset_header else 0
                minutes, seconds = divmod(wait, 60)
                print("Hit rate limit. Try again in {} minutes and {} seconds.".format(
                    int(minutes),
                    int(seconds)
                ))
                sys.exit(1)

            max_id, created_at = self._process_timeline(account, timeline)
            if max_id is None and created_at is None:
                print('Timeline empty.')
                return False

            if self._parse_created_at(created_at) <= self.start_date:
                print('All done.')
                return True
            # Otherwise request the next (older) page starting at max_id.

    def _write_csv(self, name):
        """Write the stored tweets that mention at least one party to ``name + '.csv'``.

        Args:
            name: Output path without the ``.csv`` extension.
        """
        path = name + '.csv'
        # Python 2's csv module writes byte strings and wants a binary file;
        # Python 3's writes text and wants newline='' so that it controls the
        # line endings itself.
        py2 = sys.version_info[0] < 3
        if py2:
            handle = open(path, 'wb')
        else:
            handle = open(path, 'w', newline='', encoding='utf-8')

        with handle as f:
            writer = csv.writer(f)
            writer.writerow(['Account', 'Tweet ID', 'Date', 'Content', 'Mentions UKIP?', 'Mentions Conservative?', 'Mentions Lib Dem?', 'Mentions Labour?', 'Mentions Green?'])

            for tweet in self.tweets.values():
                # If the tweet isn't political, don't include it.
                if all(tweet[party] == 'No' for party in self._parties):
                    continue

                content = tweet['content']
                if py2:
                    content = content.encode('utf-8')

                row = [tweet['account'], tweet['tweet_id'], tweet['date'], content]
                row.extend(tweet[party] for party in self._parties)
                writer.writerow(row)

    def _process_timeline(self, account, timeline):
        """Store every tweet of one timeline page in ``self.tweets``.

        Args:
            account: Screen name the page was fetched for.
            timeline: List of tweet dicts; each is read for ``id_str``,
                ``created_at`` and ``text``.

        Returns:
            ``(id_str, created_at)`` of the last tweet on the page, or
            ``(None, None)`` when the page is empty, its last tweet is
            malformed, or its last ID equals the previous page's last ID
            (meaning paging has stopped making progress).
        """
        for tweet in timeline:
            try:
                posted = self._parse_created_at(tweet['created_at'])
                text = tweet['text']
                record = {
                    'account': account,
                    'tweet_id': tweet['id_str'],
                    'date': '{day}/{month}/{year}'.format(
                        day=posted.day,
                        month=posted.month,
                        year=posted.year
                    ),
                    'content': text,
                }
                for party in self._parties:
                    record[party] = self._party_mentioned(party, text)
                # Keyed by ID, so a tweet seen on two pages is stored once.
                self.tweets[tweet['id_str']] = record
            except (KeyError, TypeError, ValueError) as error:
                # Best effort: report the malformed tweet and carry on.
                print('{} {}'.format(error, tweet))

        if not timeline:
            return (None, None)

        try:
            last_id = timeline[-1]['id_str']
            # If the next last id matches the last one, just stop.
            if last_id == self.last_id:
                return (None, None)
            self.last_id = last_id
            return (last_id, timeline[-1]['created_at'])
        except (KeyError, TypeError) as error:
            print(timeline)
            print(error)
            return (None, None)

    def _party_mentioned(self, party, tweet):
        """Return 'Yes' if the text mentions any keyword of ``party``, else 'No'.

        Args:
            party: Key into ``keywords``.
            tweet: The tweet text to search.
        """
        for keyword in self.keywords[party]:
            # re.escape keeps a keyword containing regex metacharacters from
            # being interpreted as a pattern.
            if re.search(r'\b%ss?\b' % re.escape(keyword), tweet, flags=re.IGNORECASE):
                return 'Yes'
        return 'No'

    def _parse_created_at(self, created_at):
        """Convert a ``created_at`` string into a ``datetime.date``."""
        return datetime.datetime.strptime(created_at, self._created_at_format).date()
def main():
    """Run the collection; constructing DataCollector performs the whole job."""
    DataCollector()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment