Last active
August 29, 2015 14:01
-
-
Save danielsamuels/cd5945991ce3af0d7216 to your computer and use it in GitHub Desktop.
BBC Twitter analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A Python script to pull out all Tweets from the main BBC accounts tweeting about | |
the European Elections 2014. The aim is to analyse the data to see which | |
political party was tweeted about the most. | |
Data collection: | |
Per account, without retweets, where the content matches a keyword. | |
Per account, with retweets, where the content matches a keyword. | |
Data format: | |
Account | Tweet ID | Date | Content | Mentions UKIP | Mentions Conservative | Mentions Libdem | Mentions Labour | Mentions Green | |
""" | |
from twython import Twython | |
import csv | |
import datetime | |
import re | |
import time | |
class DataCollector(object):
    """Collect BBC tweets about the 2014 European Elections.

    Pages backwards through each configured account's timeline until
    ``start_date``, records each tweet together with per-party mention
    flags, and writes the politically relevant tweets to a CSV file.
    All work happens as a side effect of ``__init__``.
    """

    # Earliest tweet date of interest; paging stops once the oldest tweet
    # in a fetched page predates this.  (Was written with leading-zero
    # literals — octal notation in Python 2, a SyntaxError in Python 3.)
    start_date = datetime.date(2014, 3, 1)

    # Accounts whose timelines are scraped.
    bbc_accounts = [
        'bbcnews',
        'bbcbreaking',
        'BBCPolitics',
        'bbcworld',
        'bbcengland',
    ]

    # Party -> keywords.  Each keyword is matched as a whole word with an
    # optional trailing 's' (see _party_mentioned), so singular forms also
    # cover their plurals.  NOTE: 'conservative' was previously listed as
    # 'conservatives', which never matched the singular word.
    keywords = {
        'ukip': ['ukip', 'farage'],
        'conservative': ['conservative', 'cameron', 'tory', 'tories'],
        'libdem': ['liberal', 'democrats', 'lib dem', 'clegg'],
        'labour': ['labour', 'miliband'],
        'green': ['green', 'bennett'],
    }

    # Party keys in CSV column order.
    parties = ('ukip', 'conservative', 'libdem', 'labour', 'green')

    def __init__(self):
        # Per-instance state.  These used to be class attributes, but
        # mutable class attributes are shared between instances and would
        # silently mix data across collectors.
        self.tweets = {}     # tweet id_str -> row dict (see _process_timeline)
        self.last_id = None  # last tweet id seen, used to detect stalled paging

        # Build the API client per instance rather than at class-definition
        # time, so merely importing this module does not require credentials.
        # APP_KEY / APP_SECRET / OAUTH_KEY / OAUTH_SECRET must be defined by
        # the surrounding environment — they are not defined in this file.
        self.twitter = Twython(
            APP_KEY,
            APP_SECRET,
            OAUTH_KEY,
            OAUTH_SECRET,
        )

        # A retweet-free pass can be run the same way with
        # include_rts=False and self._write_csv('no_rts').
        for account in self.bbc_accounts:
            # With retweets.
            self._get_timeline(account, True)
        self._write_csv('rts')
        print('All done!')

    def _get_timeline(self, account, include_rts, max_id=None):
        """Page backwards through *account*'s timeline.

        Fetches pages of up to 200 tweets, feeding each page to
        _process_timeline, until the oldest tweet seen predates
        ``start_date`` or the timeline runs out.  Iterative rather than
        recursive, so long timelines cannot exhaust the recursion limit.

        Returns True when start_date was reached, False when the timeline
        was exhausted (or paging stalled) first.  Exits the process on an
        API error, which is assumed to be rate limiting.
        """
        while True:
            print('Getting timeline for {} with {}max ID{} and {}retweets'.format(
                account,
                '' if max_id else 'no ',
                ' ' + max_id if max_id else '',
                'no ' if include_rts is False else ''
            ))
            params = {
                'screen_name': account,
                'trim_user': True,
                'exclude_replies': True,
                'include_rts': include_rts,
                'count': 200,
            }
            if max_id:
                params['max_id'] = max_id
            try:
                timeline = self.twitter.get_user_timeline(**params)
            except Exception:
                # Assume the failure is rate limiting and report when the
                # window resets.  (Previously a bare `except:`, which also
                # swallowed SystemExit/KeyboardInterrupt, followed by an
                # unreachable `return` after exit().)
                reset_time = float(self.twitter.get_lastfunction_header('X-Rate-Limit-Reset'))
                minutes, seconds = divmod(reset_time - time.time(), 60)
                print("Hit rate limit. Try again in {} minutes and {} seconds.".format(
                    int(minutes),
                    int(seconds)
                ))
                raise SystemExit(1)

            max_id, created_at = self._process_timeline(account, timeline)
            if max_id is None and created_at is None:
                print('Timeline empty.')
                return False

            parsed = time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
            oldest = datetime.date(parsed.tm_year, parsed.tm_mon, parsed.tm_mday)
            if oldest <= self.start_date:
                # Reached the start of the window of interest.
                print('All done.')
                return True
            # Otherwise loop again, paging backwards from the oldest id seen.

    def _write_csv(self, name):
        """Write the collected, politically relevant tweets to <name>.csv.

        Tweets that mention no tracked party are skipped entirely.
        """
        # 'wb' + manual utf-8 encoding: this module targets Python 2, where
        # the csv module expects a binary file and byte strings.
        with open(name + '.csv', 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['Account', 'Tweet ID', 'Date', 'Content', 'Mentions UKIP?', 'Mentions Conservative?', 'Mentions Lib Dem?', 'Mentions Labour?', 'Mentions Green?'])
            for tweet in self.tweets.values():
                # If the tweet isn't political, don't include it.
                if all(tweet[party] == 'No' for party in self.parties):
                    continue
                writer.writerow([
                    tweet['account'],
                    tweet['tweet_id'],
                    tweet['date'],
                    tweet['content'].encode("utf-8"),
                    tweet['ukip'],
                    tweet['conservative'],
                    tweet['libdem'],
                    tweet['labour'],
                    tweet['green'],
                ])

    def _process_timeline(self, account, timeline):
        """Store every tweet in *timeline* and report paging state.

        Returns (id_str, created_at) of the oldest tweet in the page, or
        (None, None) when the page is empty or paging has stalled (the
        same trailing id seen twice in a row).
        """
        for tweet in timeline:
            parsed_date = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            try:
                # Store the tweet keyed by its id (also deduplicates the
                # overlapping tweet that max_id paging re-fetches).
                self.tweets[tweet['id_str']] = {
                    'account': account,
                    'tweet_id': tweet['id_str'],
                    'date': '{day}/{month}/{year}'.format(
                        day=parsed_date.tm_mday,
                        month=parsed_date.tm_mon,
                        year=parsed_date.tm_year
                    ),
                    'content': tweet['text'],
                    'ukip': self._party_mentioned('ukip', tweet['text']),
                    'conservative': self._party_mentioned('conservative', tweet['text']),
                    'libdem': self._party_mentioned('libdem', tweet['text']),
                    'labour': self._party_mentioned('labour', tweet['text']),
                    'green': self._party_mentioned('green', tweet['text']),
                }
            except Exception as e:
                # A single malformed tweet should not abort the whole run.
                print(e)
                print(tweet)

        # Explicit emptiness check instead of catching IndexError from
        # timeline[-1].
        if not timeline:
            return (None, None)
        newest = timeline[-1]
        # If the trailing id matches the previous page's, paging has
        # stalled: stop rather than loop forever.
        if newest['id_str'] == self.last_id:
            return (None, None)
        self.last_id = newest['id_str']
        return (newest['id_str'], newest['created_at'])

    def _party_mentioned(self, party, tweet):
        """Return 'Yes' if *tweet* mentions any keyword for *party*, else 'No'.

        Keywords match as whole words, case-insensitively, with an
        optional trailing 's' (so 'tory' also matches 'torys'; the
        irregular plural 'tories' is listed explicitly).
        """
        for keyword in self.keywords[party]:
            if re.search(r'\b%ss?\b' % keyword, tweet, flags=re.IGNORECASE):
                return 'Yes'
        return 'No'
# Script entry point: all work (timeline fetching and CSV output) happens
# inside DataCollector.__init__, so instantiation alone runs the collection.
if __name__ == '__main__':
    DataCollector()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment