Created
June 15, 2018 16:41
-
-
Save DrDanL/cedba01aabf53490d85c5d0999d9f7c5 to your computer and use it in GitHub Desktop.
This is a Gist for using Python 3 to perform NLP. This is from the blog post entitled Sentiment analysis of Thameslink Tweets using Python 3. See http://leightley.com/sentiment-analysis-of-thameslink-tweets-using-python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import core libraries
import tweepy          # Obtain Tweets
import pandas as pd    # Store and manage Tweets
import numpy as np     # Number processing
import re              # Regex-based tweet cleaning
from credentials import *  # supplies CONSUMER_KEY/SECRET, ACCESS_TOKEN/SECRET
from textblob import TextBlob  # Sentiment scoring

# Setup plotting and visualisation
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): '%matplotlib inline' is IPython magic, not Python — it is a
# syntax error in a plain .py file.  Re-enable it only inside a notebook.
# %matplotlib inline
print('Import libraries')
# API's authentication by defining a function
def twitter_setup():
    """Create and return an authenticated tweepy API handle.

    Relies on CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_SECRET
    being provided by the star-imported ``credentials`` module.
    """
    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Obtain authenticated API
    api = tweepy.API(auth)
    return api
# We create an extractor object (holding the api data) by calling in our
# twitter_setup() function
extractor = twitter_setup()
def get_user_tweets(api, username):
    """Return a list of every tweet on *username*'s timeline.

    api      -- an authenticated tweepy.API instance
    username -- Twitter screen name to fetch (without the leading '@')
    """
    # tweepy.Cursor transparently pages through the timeline until exhausted.
    tweets = []
    for status in tweepy.Cursor(api.user_timeline, screen_name=username).items():
        tweets.append(status)
    return tweets
# Pull the full timeline for the Thameslink customer-service account.
alltweets = get_user_tweets(extractor, 'TLRailUK')
print("Number of tweets extracted: {}.\n".format(len(alltweets)))

# We print the most recent 5 tweets for reference:
print("5 recent tweets:\n")
for tweet in alltweets[:5]:
    print(tweet.text)
    print()
# We create a pandas DataFrame, one row per tweet text.
data = pd.DataFrame(data=[tweet.text for tweet in alltweets], columns=['Tweets'])

# We display the first 10 elements of the DataFrame:
display(data.head(10))

# List every attribute available on a tweepy Status object, then show a
# selection of interesting attributes from the first tweet.
print(dir(alltweets[0]))
print(alltweets[0].id)
print(alltweets[0].created_at)
print(alltweets[0].source)
print(alltweets[0].favorite_count)
print(alltweets[0].retweet_count)
print(alltweets[0].geo)
print(alltweets[0].coordinates)
print(alltweets[0].entities)
# Add attributes of interest as extra DataFrame columns.
data['len'] = np.array([len(tweet.text) for tweet in alltweets])
data['ID'] = np.array([tweet.id for tweet in alltweets])
data['Date'] = np.array([tweet.created_at for tweet in alltweets])
data['Source'] = np.array([tweet.source for tweet in alltweets])
data['Likes'] = np.array([tweet.favorite_count for tweet in alltweets])
data['RTs'] = np.array([tweet.retweet_count for tweet in alltweets])

# Display of first 10 elements from DataFrame
display(data.head(10))

# We extract the mean of lengths (typo "lenght" fixed in the output string).
mean = np.mean(data['len'])
print("The length's average in tweets: {}".format(mean))

# We extract the tweets which were the most favourited and retweeted.
fav_max = np.max(data['Likes'])
rt_max = np.max(data['RTs'])
fav = data[data.Likes == fav_max].index[0]
rt = data[data.RTs == rt_max].index[0]

# Max favorited
print("The tweet with more likes is: \n{}".format(data['Tweets'][fav]))
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(data['len'][fav]))

# Max retweet
print("The tweet with more retweets is: \n{}".format(data['Tweets'][rt]))
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(data['len'][rt]))
# We create time series indexed by tweet date for length, likes and retweets.
tlen = pd.Series(data=data['len'].values, index=data['Date'])
tfav = pd.Series(data=data['Likes'].values, index=data['Date'])
tret = pd.Series(data=data['RTs'].values, index=data['Date'])

# Tweet lengths over time
tlen.plot(figsize=(16, 4), color='r')

# Likes vs retweets plot
tfav.plot(figsize=(16, 4), label="Likes", legend=True)
tret.plot(figsize=(16, 4), label="Retweets", legend=True)
# We obtain all distinct sources from the data, preserving first-seen order.
sources = []
for source in data['Source']:
    if source not in sources:
        sources.append(source)

# We print the source list
print("Creation of content sources:")
for source in sources:
    print("* {}".format(source))

# Map each source to its share of all tweets.
# BUG FIX: the original divided the raw counts by a hard-coded 100, which is
# only a percentage when there are exactly 100 tweets.  Divide by the real
# total instead.  (The rendered pie is identical either way, because autopct
# normalises the values itself.)
percent = np.zeros(len(sources))
for source in data['Source']:
    percent[sources.index(source)] += 1
percent /= len(data['Source'])

# Render the pie chart:
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6))
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a Tweet by removing
    links and special characters using regex re.

    Strips @mentions, URLs and any character that is not alphanumeric,
    space or tab, then collapses runs of whitespace into single spaces.
    '''
    # Raw string fixes the invalid-escape warnings (\w, \S, \/) that the
    # original non-raw pattern triggers on modern Python; behaviour is
    # unchanged.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())
def analize_sentiment(tweet):
    '''
    Classify the sentiment polarity of a Tweet with TextBlob.

    Returns 1 for positive polarity, 0 for neutral, -1 for negative.
    '''
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity == 0:
        return 0
    return 1 if polarity > 0 else -1
# We create a column populated with the sentiment score.
data['SA'] = np.array([analize_sentiment(tweet) for tweet in data['Tweets']])

# We display the DataFrame with updated score.
display(data.head(10))

# Partition the tweets by sentiment class.  Boolean masks replace the
# original enumerate() loops, which mixed positional and label indexing
# (equivalent here only because the index is the default RangeIndex).
pos_tweets = list(data['Tweets'][data['SA'] > 0])
neu_tweets = list(data['Tweets'][data['SA'] == 0])
neg_tweets = list(data['Tweets'][data['SA'] < 0])

# We print percentages
print("Percentage of positive tweets: {}%".format(len(pos_tweets)*100/len(data['Tweets'])))
print("Percentage of neutral tweets: {}%".format(len(neu_tweets)*100/len(data['Tweets'])))
print("Percentage of negative tweets: {}%".format(len(neg_tweets)*100/len(data['Tweets'])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment