Last active
August 29, 2015 14:01
-
-
Save danielsamuels/cd5945991ce3af0d7216 to your computer and use it in GitHub Desktop.
BBC Twitter analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A Python script to pull out all Tweets from the main BBC accounts tweeting about | |
the European Elections 2014. The aim is to analyse the data to see which | |
political party was tweeted about the most. | |
Data collection: | |
Per account, without retweets, where the content matches a keyword. | |
Per account, with retweets, where the content matches a keyword. | |
Data format: | |
Account | Tweet ID | Date | Content | Mentions UKIP | Mentions Conservative | Mentions Libdem | Mentions Labour | Mentions Green | |
""" | |
from twython import Twython | |
import csv | |
import datetime | |
import re | |
import time | |
class DataCollector(object):
    """Collect BBC tweets about the 2014 European Elections.

    Pages backwards through each configured account's timeline until
    ``start_date``, records each tweet together with per-party mention
    flags, and writes the politically relevant tweets to a CSV file.
    All work happens as a side effect of ``__init__``.
    """

    # Earliest tweet date of interest; paging stops once the oldest tweet
    # in a fetched page predates this.  (Was written with leading-zero
    # literals — octal notation in Python 2, a SyntaxError in Python 3.)
    start_date = datetime.date(2014, 3, 1)

    # Accounts whose timelines are scraped.
    bbc_accounts = [
        'bbcnews',
        'bbcbreaking',
        'BBCPolitics',
        'bbcworld',
        'bbcengland',
    ]

    # Party -> keywords.  Each keyword is matched as a whole word with an
    # optional trailing 's' (see _party_mentioned), so singular forms also
    # cover their plurals.  NOTE: 'conservative' was previously listed as
    # 'conservatives', which never matched the singular word.
    keywords = {
        'ukip': ['ukip', 'farage'],
        'conservative': ['conservative', 'cameron', 'tory', 'tories'],
        'libdem': ['liberal', 'democrats', 'lib dem', 'clegg'],
        'labour': ['labour', 'miliband'],
        'green': ['green', 'bennett'],
    }

    # Party keys in CSV column order.
    parties = ('ukip', 'conservative', 'libdem', 'labour', 'green')

    def __init__(self):
        # Per-instance state.  These used to be class attributes, but
        # mutable class attributes are shared between instances and would
        # silently mix data across collectors.
        self.tweets = {}     # tweet id_str -> row dict (see _process_timeline)
        self.last_id = None  # last tweet id seen, used to detect stalled paging

        # Build the API client per instance rather than at class-definition
        # time, so merely importing this module does not require credentials.
        # APP_KEY / APP_SECRET / OAUTH_KEY / OAUTH_SECRET must be defined by
        # the surrounding environment — they are not defined in this file.
        self.twitter = Twython(
            APP_KEY,
            APP_SECRET,
            OAUTH_KEY,
            OAUTH_SECRET,
        )

        # A retweet-free pass can be run the same way with
        # include_rts=False and self._write_csv('no_rts').
        for account in self.bbc_accounts:
            # With retweets.
            self._get_timeline(account, True)
        self._write_csv('rts')
        print('All done!')

    def _get_timeline(self, account, include_rts, max_id=None):
        """Page backwards through *account*'s timeline.

        Fetches pages of up to 200 tweets, feeding each page to
        _process_timeline, until the oldest tweet seen predates
        ``start_date`` or the timeline runs out.  Iterative rather than
        recursive, so long timelines cannot exhaust the recursion limit.

        Returns True when start_date was reached, False when the timeline
        was exhausted (or paging stalled) first.  Exits the process on an
        API error, which is assumed to be rate limiting.
        """
        while True:
            print('Getting timeline for {} with {}max ID{} and {}retweets'.format(
                account,
                '' if max_id else 'no ',
                ' ' + max_id if max_id else '',
                'no ' if include_rts is False else ''
            ))
            params = {
                'screen_name': account,
                'trim_user': True,
                'exclude_replies': True,
                'include_rts': include_rts,
                'count': 200,
            }
            if max_id:
                params['max_id'] = max_id
            try:
                timeline = self.twitter.get_user_timeline(**params)
            except Exception:
                # Assume the failure is rate limiting and report when the
                # window resets.  (Previously a bare `except:`, which also
                # swallowed SystemExit/KeyboardInterrupt, followed by an
                # unreachable `return` after exit().)
                reset_time = float(self.twitter.get_lastfunction_header('X-Rate-Limit-Reset'))
                minutes, seconds = divmod(reset_time - time.time(), 60)
                print("Hit rate limit. Try again in {} minutes and {} seconds.".format(
                    int(minutes),
                    int(seconds)
                ))
                raise SystemExit(1)

            max_id, created_at = self._process_timeline(account, timeline)
            if max_id is None and created_at is None:
                print('Timeline empty.')
                return False

            parsed = time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
            oldest = datetime.date(parsed.tm_year, parsed.tm_mon, parsed.tm_mday)
            if oldest <= self.start_date:
                # Reached the start of the window of interest.
                print('All done.')
                return True
            # Otherwise loop again, paging backwards from the oldest id seen.

    def _write_csv(self, name):
        """Write the collected, politically relevant tweets to <name>.csv.

        Tweets that mention no tracked party are skipped entirely.
        """
        # 'wb' + manual utf-8 encoding: this module targets Python 2, where
        # the csv module expects a binary file and byte strings.
        with open(name + '.csv', 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['Account', 'Tweet ID', 'Date', 'Content', 'Mentions UKIP?', 'Mentions Conservative?', 'Mentions Lib Dem?', 'Mentions Labour?', 'Mentions Green?'])
            for tweet in self.tweets.values():
                # If the tweet isn't political, don't include it.
                if all(tweet[party] == 'No' for party in self.parties):
                    continue
                writer.writerow([
                    tweet['account'],
                    tweet['tweet_id'],
                    tweet['date'],
                    tweet['content'].encode("utf-8"),
                    tweet['ukip'],
                    tweet['conservative'],
                    tweet['libdem'],
                    tweet['labour'],
                    tweet['green'],
                ])

    def _process_timeline(self, account, timeline):
        """Store every tweet in *timeline* and report paging state.

        Returns (id_str, created_at) of the oldest tweet in the page, or
        (None, None) when the page is empty or paging has stalled (the
        same trailing id seen twice in a row).
        """
        for tweet in timeline:
            parsed_date = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            try:
                # Store the tweet keyed by its id (also deduplicates the
                # overlapping tweet that max_id paging re-fetches).
                self.tweets[tweet['id_str']] = {
                    'account': account,
                    'tweet_id': tweet['id_str'],
                    'date': '{day}/{month}/{year}'.format(
                        day=parsed_date.tm_mday,
                        month=parsed_date.tm_mon,
                        year=parsed_date.tm_year
                    ),
                    'content': tweet['text'],
                    'ukip': self._party_mentioned('ukip', tweet['text']),
                    'conservative': self._party_mentioned('conservative', tweet['text']),
                    'libdem': self._party_mentioned('libdem', tweet['text']),
                    'labour': self._party_mentioned('labour', tweet['text']),
                    'green': self._party_mentioned('green', tweet['text']),
                }
            except Exception as e:
                # A single malformed tweet should not abort the whole run.
                print(e)
                print(tweet)

        # Explicit emptiness check instead of catching IndexError from
        # timeline[-1].
        if not timeline:
            return (None, None)
        newest = timeline[-1]
        # If the trailing id matches the previous page's, paging has
        # stalled: stop rather than loop forever.
        if newest['id_str'] == self.last_id:
            return (None, None)
        self.last_id = newest['id_str']
        return (newest['id_str'], newest['created_at'])

    def _party_mentioned(self, party, tweet):
        """Return 'Yes' if *tweet* mentions any keyword for *party*, else 'No'.

        Keywords match as whole words, case-insensitively, with an
        optional trailing 's' (so 'tory' also matches 'torys'; the
        irregular plural 'tories' is listed explicitly).
        """
        for keyword in self.keywords[party]:
            if re.search(r'\b%ss?\b' % keyword, tweet, flags=re.IGNORECASE):
                return 'Yes'
        return 'No'
# Script entry point: all work (timeline fetching and CSV output) happens
# inside DataCollector.__init__, so instantiation alone runs the collection.
if __name__ == '__main__':
    DataCollector()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment