Skip to content

Instantly share code, notes, and snippets.

@NewscatcherAPI
Last active November 17, 2021 09:31
Show Gist options
  • Save NewscatcherAPI/b5103a8e4542b45cdd1f43658a0eb1f6 to your computer and use it in GitHub Desktop.
Sentiment analysis
def clean_text(text, all_mentions):
    """Normalize a tweet's text for sentiment analysis.

    Strips the retweet marker, URLs, @mentions, punctuation and English
    stop words, lowercases everything, and returns the result as a single
    space-joined string.

    Args:
        text: Raw tweet text (``full_text`` from the Twitter API).
        all_mentions: Screen names (without ``@``) mentioned in the tweet.

    Returns:
        Tuple of (cleaned text, list of links that were removed).
    """
    # If retweet, delete the leading "RT <account>:" marker.
    # Raw strings so the regex escapes (\s) are not mangled by Python.
    text = re.sub(r'(RT\s.*):', '', text)
    # Find all links first (the trailing space guarantees a match at the
    # end of the text), then delete them; the links are also returned.
    all_links = re.findall(r'(https:.*?)\s', text + ' ')
    for link in all_links:
        text = text.replace(link, '')
    # Remove every @mention.
    for mention in all_mentions:
        text = text.replace('@' + mention, '')
    # Tokenize; hyphens become spaces so compound words split cleanly.
    tokens = word_tokenize(text.replace('-', ' '))
    # Lowercase, strip punctuation, and keep alphabetic tokens only.
    table = str.maketrans('', '', string.punctuation)
    stripped = [token.lower().translate(table) for token in tokens]
    words = [word for word in stripped if word.isalpha()]
    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    phrase = " ".join(words)
    return phrase, all_links
# Clean every fetched tweet in place: same transformation applied to the
# Apple, Facebook and Amazon batches in that order.
for batch in (results_apple, results_facebook, results_amazon):
    for tweet in batch:
        mentions = [m['screen_name'] for m in tweet['entities']['user_mentions']]
        tweet['clean_text'], tweet['all_link'] = clean_text(tweet['full_text'], mentions)
# Inspect one cleaned record.
results_apple[0]
# Fields we keep from each raw tweet's JSON payload.
variables_we_need = ['created_at', 'id', 'full_text', 'entities', 'user', 'coordinates', 'retweet_count', 'favorite_count', 'lang']

def get_all_tweets(count=100, q='', lang='', since='', tweet_mode='extended', until=''):
    """Search recent tweets and return a trimmed dict per tweet.

    Args:
        count: Maximum number of tweets to fetch.
        q: Search query string.
        lang: ISO language code filter.
        since: Earliest date (YYYY-MM-DD).
        tweet_mode: 'extended' to get untruncated ``full_text``.
        until: Latest date (YYYY-MM-DD); new, optional, so existing
            callers that pass ``until=`` no longer raise TypeError and
            callers that omit it behave exactly as before.

    Returns:
        List of dicts, each holding only the keys in ``variables_we_need``.
    """
    # Only forward `until` when provided, to keep the API request
    # identical to the original for callers that omit it.
    search_kwargs = dict(q=q, lang=lang, since=since, tweet_mode=tweet_mode)
    if until:
        search_kwargs['until'] = until
    results = []
    for tweet in tweepy.Cursor(api.search, **search_kwargs).items(count):
        results.append({v: tweet._json[v] for v in variables_we_need})
    return results
# One week of tweets per company; identical parameters except the query.
common_search = dict(count=1000, tweet_mode='extended', lang='en',
                     since='2021-10-25', until='2021-10-31')
results_apple = get_all_tweets(q='apple', **common_search)
results_facebook = get_all_tweets(q='facebook', **common_search)
results_amazon = get_all_tweets(q='amazon', **common_search)
# Peek at the first Apple tweet.
results_apple[0]
# Fetch a single tweet to inspect the raw JSON structure.
tweets = tweepy.Cursor(api.search, q='Apple', tweet_mode='extended').items(1)
# Fix: the original read `tweet[-1]` — `tweet` was undefined and a Cursor
# item-iterator is not subscriptable anyway. Materialize it first.
one_tweet = list(tweets)[-1]
one_tweet._json
# Download and load FinBERT, a BERT model fine-tuned for financial sentiment.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# Fix: get_sentiments() calls this pipeline as `sentiment_analysis`, but it
# was only bound to `nlp` (NameError at runtime). Bind both names; `nlp` is
# kept as an alias for backward compatibility.
sentiment_analysis = nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
# Labels FinBERT can emit; used to expand scores into per-label columns.
possible_sentiments = ['negative', 'neutral', 'positive']
# Get sentiments
def get_sentiments(input_dict, variable_text):
    """Attach FinBERT sentiment scores to each record, in place.

    Args:
        input_dict: List of dicts (tweets or articles) to score.
        variable_text: Key of the text field to run through the model.

    Returns:
        The same list, with one float per label in ``possible_sentiments``
        added to every record (0 for labels the model did not return).
    """
    for record in input_dict:
        # Default every label to 0 first, then fill in returned scores.
        # The original's if/else overwrote already-set labels with 0 when
        # the pipeline returned more than one result dict.
        for shade in possible_sentiments:
            record[shade] = 0
        for result in sentiment_analysis(record[variable_text]):
            if result['label'] in possible_sentiments:
                record[result['label']] = result['score']
    return input_dict
# Score the cleaned text of every tweet batch; get_sentiments mutates and
# returns each list, so the rebinds are equivalent to the originals.
results_apple, results_facebook, results_amazon = (
    get_sentiments(batch, 'clean_text')
    for batch in (results_apple, results_facebook, results_amazon)
)
import os
import re
import tweepy
from tweepy import OAuthHandler
import numpy as np
import pandas as pd
# text treatement
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#Wordcloud
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
# Graphs
import plotly.io as pio
pio.renderers.default='browser'
import plotly.express as px
# transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForTokenClassification
# Shell command (run in a terminal, not in Python):
# pip install -r requirements.txt
# Import Packages
from newscatcherapi import NewsCatcherApiClient
import time
# Initialize NewsCatcher API
# NOTE(review): replace the placeholder with a real key before running;
# prefer reading it from an environment variable, as done for Twitter below.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR-X-API-KEY')
# Extract News: 10 pages x 100 articles per company over one week.
apple_articles = []
facebook_articles = []
amazon_articles = []
# (destination list, search query) per company, in the original call order.
searches = [
    (apple_articles, '(Apple AND company) OR "Apple Inc"'),
    (facebook_articles, '(Facebook AND company) OR "Facebook Inc"'),
    (amazon_articles, '(Amazon AND company) OR "Amazon Inc"'),
]
for page in range(1, 11):
    for bucket, query in searches:
        response = newscatcherapi.get_search(q=query,
                                             lang='en',
                                             from_='2021-10-25',
                                             to_='2021-10-31',
                                             page_size=100,
                                             page=page)
        bucket.extend(response['articles'])
        # Pause between requests to respect the API rate limit.
        time.sleep(1)
# Twitter credentials come from the environment — never hard-code them.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ['CONSUMER_KEY'],
    os.environ['CONSUMER_SECRET'],
    os.environ['ACCESS_TOKEN'],
    os.environ['ACCESS_SECRET'],
)
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep through rate-limit windows
# instead of raising.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# Score article titles and keep only the three sentiment columns.
sentiment_cols = ['negative', 'neutral', 'positive']
apple_articles_pd = pd.DataFrame(get_sentiments(apple_articles, 'title')).loc[:, sentiment_cols]
facebook_articles_pd = pd.DataFrame(get_sentiments(facebook_articles, 'title')).loc[:, sentiment_cols]
amazon_articles_pd = pd.DataFrame(get_sentiments(amazon_articles, 'title')).loc[:, sentiment_cols]
# Mean score per label per company, one row per company.
total_score_articles = (
    pd.concat([apple_articles_pd.mean(),
               facebook_articles_pd.mean(),
               amazon_articles_pd.mean()], axis=1)
    .transpose()
    .reset_index()
)
total_score_articles.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_articles['Company'] = ['Apple', 'Facebook', 'Amazon']
# Sentiment Score
total_score_articles
# Graph: grouped bars of mean sentiment per company (news articles).
fig = px.histogram(
    total_score_articles,
    x='Company',
    y=['negative', 'neutral', 'positive'],
    title='Sentiment Score by Company | News Articles',
    barmode='group',
    color_discrete_sequence=["red", "blue", "green"],
)
fig.update_xaxes(title='Companies')
fig.update_yaxes(title='Sentiment score')
fig.show()
# Create DataFrames holding only the sentiment columns of each tweet batch.
score_cols = ['negative', 'neutral', 'positive']
apple_tweets_pd = pd.DataFrame(results_apple).loc[:, score_cols]
facebook_tweets_pd = pd.DataFrame(results_facebook).loc[:, score_cols]
amazon_tweets_pd = pd.DataFrame(results_amazon).loc[:, score_cols]
# Average each company's scores and stack into one row-per-company table.
total_score_tweets = (
    pd.concat([apple_tweets_pd.mean(),
               facebook_tweets_pd.mean(),
               amazon_tweets_pd.mean()], axis=1)
    .transpose()
    .reset_index()
)
total_score_tweets.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_tweets['Company'] = ['Apple', 'Facebook', 'Amazon']
total_score_tweets
# Visualize: grouped bars of mean sentiment per company (tweets).
fig = px.histogram(
    total_score_tweets,
    x='Company',
    y=['negative', 'neutral', 'positive'],
    title='Sentiment Score by Company | Tweets',
    barmode='group',
    color_discrete_sequence=["red", "blue", "green"],
)
fig.update_xaxes(title='Companies')
fig.update_yaxes(title='Sentiment score')
fig.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment