Created
June 15, 2018 16:41
-
-
Save DrDanL/cedba01aabf53490d85c5d0999d9f7c5 to your computer and use it in GitHub Desktop.
This is a Gist for using Python 3 to perform NLP. This is from the blog post entitled Sentiment analysis of Thameslink Tweets using Python 3. See http://leightley.com/sentiment-analysis-of-thameslink-tweets-using-python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import core libraries
import tweepy          # Obtain Tweets
import pandas as pd    # Store and manage Tweets
import numpy as np     # Number processing
import re              # Regex-based tweet cleaning
from credentials import *  # supplies CONSUMER_KEY/SECRET, ACCESS_TOKEN/SECRET
from textblob import TextBlob  # Sentiment scoring

# Setup plotting and visualisation
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): '%matplotlib inline' is IPython magic, not Python — it is a
# syntax error in a plain .py file.  Re-enable it only inside a notebook.
# %matplotlib inline
print('Import libraries')
# API's authentication by defining a function
def twitter_setup():
    """Create and return an authenticated tweepy API handle.

    Relies on CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_SECRET
    being provided by the star-imported ``credentials`` module.
    """
    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Obtain authenticated API
    api = tweepy.API(auth)
    return api
# We create an extractor object (holding the api data) by calling in our
# twitter_setup() function
extractor = twitter_setup()
def get_user_tweets(api, username):
    """Return a list of every tweet on *username*'s timeline.

    api      -- an authenticated tweepy.API instance
    username -- Twitter screen name to fetch (without the leading '@')
    """
    # tweepy.Cursor transparently pages through the timeline until exhausted.
    tweets = []
    for status in tweepy.Cursor(api.user_timeline, screen_name=username).items():
        tweets.append(status)
    return tweets
# Pull the full timeline for the Thameslink customer-service account.
alltweets = get_user_tweets(extractor, 'TLRailUK')
print("Number of tweets extracted: {}.\n".format(len(alltweets)))

# We print the most recent 5 tweets for reference:
print("5 recent tweets:\n")
for tweet in alltweets[:5]:
    print(tweet.text)
    print()
# We create a pandas DataFrame, one row per tweet text.
data = pd.DataFrame(data=[tweet.text for tweet in alltweets], columns=['Tweets'])

# We display the first 10 elements of the DataFrame:
display(data.head(10))

# List every attribute available on a tweepy Status object, then show a
# selection of interesting attributes from the first tweet.
print(dir(alltweets[0]))
print(alltweets[0].id)
print(alltweets[0].created_at)
print(alltweets[0].source)
print(alltweets[0].favorite_count)
print(alltweets[0].retweet_count)
print(alltweets[0].geo)
print(alltweets[0].coordinates)
print(alltweets[0].entities)
# Add attributes of interest as extra DataFrame columns.
data['len'] = np.array([len(tweet.text) for tweet in alltweets])
data['ID'] = np.array([tweet.id for tweet in alltweets])
data['Date'] = np.array([tweet.created_at for tweet in alltweets])
data['Source'] = np.array([tweet.source for tweet in alltweets])
data['Likes'] = np.array([tweet.favorite_count for tweet in alltweets])
data['RTs'] = np.array([tweet.retweet_count for tweet in alltweets])

# Display of first 10 elements from DataFrame
display(data.head(10))

# We extract the mean of lengths (typo "lenght" fixed in the output string).
mean = np.mean(data['len'])
print("The length's average in tweets: {}".format(mean))

# We extract the tweets which were the most favourited and retweeted.
fav_max = np.max(data['Likes'])
rt_max = np.max(data['RTs'])
fav = data[data.Likes == fav_max].index[0]
rt = data[data.RTs == rt_max].index[0]

# Max favorited
print("The tweet with more likes is: \n{}".format(data['Tweets'][fav]))
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(data['len'][fav]))

# Max retweet
print("The tweet with more retweets is: \n{}".format(data['Tweets'][rt]))
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(data['len'][rt]))
# We create time series indexed by tweet date for length, likes and retweets.
tlen = pd.Series(data=data['len'].values, index=data['Date'])
tfav = pd.Series(data=data['Likes'].values, index=data['Date'])
tret = pd.Series(data=data['RTs'].values, index=data['Date'])

# Tweet lengths over time
tlen.plot(figsize=(16, 4), color='r')

# Likes vs retweets plot
tfav.plot(figsize=(16, 4), label="Likes", legend=True)
tret.plot(figsize=(16, 4), label="Retweets", legend=True)
# We obtain all distinct sources from the data, preserving first-seen order.
sources = []
for source in data['Source']:
    if source not in sources:
        sources.append(source)

# We print the source list
print("Creation of content sources:")
for source in sources:
    print("* {}".format(source))

# Map each source to its share of all tweets.
# BUG FIX: the original divided the raw counts by a hard-coded 100, which is
# only a percentage when there are exactly 100 tweets.  Divide by the real
# total instead.  (The rendered pie is identical either way, because autopct
# normalises the values itself.)
percent = np.zeros(len(sources))
for source in data['Source']:
    percent[sources.index(source)] += 1
percent /= len(data['Source'])

# Render the pie chart:
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6))
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a Tweet by removing
    links and special characters using regex re.

    Strips @mentions, URLs and any character that is not alphanumeric,
    space or tab, then collapses runs of whitespace into single spaces.
    '''
    # Raw string fixes the invalid-escape warnings (\w, \S, \/) that the
    # original non-raw pattern triggers on modern Python; behaviour is
    # unchanged.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())
def analize_sentiment(tweet):
    '''
    Classify the sentiment polarity of a Tweet with TextBlob.

    Returns 1 for positive polarity, 0 for neutral, -1 for negative.
    '''
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity == 0:
        return 0
    return 1 if polarity > 0 else -1
# We create a column populated with the sentiment score.
data['SA'] = np.array([analize_sentiment(tweet) for tweet in data['Tweets']])

# We display the DataFrame with updated score.
display(data.head(10))

# Partition the tweets by sentiment class.  Boolean masks replace the
# original enumerate() loops, which mixed positional and label indexing
# (equivalent here only because the index is the default RangeIndex).
pos_tweets = list(data['Tweets'][data['SA'] > 0])
neu_tweets = list(data['Tweets'][data['SA'] == 0])
neg_tweets = list(data['Tweets'][data['SA'] < 0])

# We print percentages
print("Percentage of positive tweets: {}%".format(len(pos_tweets)*100/len(data['Tweets'])))
print("Percentage of neutral tweets: {}%".format(len(neu_tweets)*100/len(data['Tweets'])))
print("Percentage of negative tweets: {}%".format(len(neg_tweets)*100/len(data['Tweets'])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment