Skip to content

Instantly share code, notes, and snippets.

@NewscatcherAPI
Last active November 17, 2021 09:31
Show Gist options
  • Save NewscatcherAPI/b5103a8e4542b45cdd1f43658a0eb1f6 to your computer and use it in GitHub Desktop.
Sentiment analysis
def clean_text(text, all_mentions):
    """Normalize a tweet's text for sentiment analysis.

    Strips the retweet marker, URLs, @mentions, punctuation and English
    stop words, lowercases everything, and returns the result as a single
    space-joined string.

    Args:
        text: Raw tweet text (``full_text`` from the Twitter API).
        all_mentions: Screen names (without ``@``) mentioned in the tweet.

    Returns:
        Tuple of (cleaned text, list of links that were removed).
    """
    # If retweet, delete the leading "RT <account>:" marker.
    # Raw strings so the regex escapes (\s) are not mangled by Python.
    text = re.sub(r'(RT\s.*):', '', text)
    # Find all links first (the trailing space guarantees a match at the
    # end of the text), then delete them; the links are also returned.
    all_links = re.findall(r'(https:.*?)\s', text + ' ')
    for link in all_links:
        text = text.replace(link, '')
    # Remove every @mention.
    for mention in all_mentions:
        text = text.replace('@' + mention, '')
    # Tokenize; hyphens become spaces so compound words split cleanly.
    tokens = word_tokenize(text.replace('-', ' '))
    # Lowercase, strip punctuation, and keep alphabetic tokens only.
    table = str.maketrans('', '', string.punctuation)
    stripped = [token.lower().translate(table) for token in tokens]
    words = [word for word in stripped if word.isalpha()]
    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    phrase = " ".join(words)
    return phrase, all_links
# Clean every fetched tweet in place: same transformation applied to the
# Apple, Facebook and Amazon batches in that order.
for batch in (results_apple, results_facebook, results_amazon):
    for tweet in batch:
        mentions = [m['screen_name'] for m in tweet['entities']['user_mentions']]
        tweet['clean_text'], tweet['all_link'] = clean_text(tweet['full_text'], mentions)
# Inspect one cleaned record.
results_apple[0]
# Fields we keep from each raw tweet's JSON payload.
variables_we_need = ['created_at', 'id', 'full_text', 'entities', 'user', 'coordinates', 'retweet_count', 'favorite_count', 'lang']

def get_all_tweets(count=100, q='', lang='', since='', tweet_mode='extended', until=''):
    """Search recent tweets and return a trimmed dict per tweet.

    Args:
        count: Maximum number of tweets to fetch.
        q: Search query string.
        lang: ISO language code filter.
        since: Earliest date (YYYY-MM-DD).
        tweet_mode: 'extended' to get untruncated ``full_text``.
        until: Latest date (YYYY-MM-DD); new, optional, so existing
            callers that pass ``until=`` no longer raise TypeError and
            callers that omit it behave exactly as before.

    Returns:
        List of dicts, each holding only the keys in ``variables_we_need``.
    """
    # Only forward `until` when provided, to keep the API request
    # identical to the original for callers that omit it.
    search_kwargs = dict(q=q, lang=lang, since=since, tweet_mode=tweet_mode)
    if until:
        search_kwargs['until'] = until
    results = []
    for tweet in tweepy.Cursor(api.search, **search_kwargs).items(count):
        results.append({v: tweet._json[v] for v in variables_we_need})
    return results
# One week of tweets per company; identical parameters except the query.
common_search = dict(count=1000, tweet_mode='extended', lang='en',
                     since='2021-10-25', until='2021-10-31')
results_apple = get_all_tweets(q='apple', **common_search)
results_facebook = get_all_tweets(q='facebook', **common_search)
results_amazon = get_all_tweets(q='amazon', **common_search)
# Peek at the first Apple tweet.
results_apple[0]
# Fetch a single tweet to inspect the raw JSON structure.
tweets = tweepy.Cursor(api.search, q='Apple', tweet_mode='extended').items(1)
# Fix: the original read `tweet[-1]` — `tweet` was undefined and a Cursor
# item-iterator is not subscriptable anyway. Materialize it first.
one_tweet = list(tweets)[-1]
one_tweet._json
# Download and load FinBERT, a BERT model fine-tuned for financial sentiment.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# Fix: get_sentiments() calls this pipeline as `sentiment_analysis`, but it
# was only bound to `nlp` (NameError at runtime). Bind both names; `nlp` is
# kept as an alias for backward compatibility.
sentiment_analysis = nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
# Labels FinBERT can emit; used to expand scores into per-label columns.
possible_sentiments = ['negative', 'neutral', 'positive']
# Get sentiments
def get_sentiments(input_dict, variable_text):
    """Attach FinBERT sentiment scores to each record, in place.

    Args:
        input_dict: List of dicts (tweets or articles) to score.
        variable_text: Key of the text field to run through the model.

    Returns:
        The same list, with one float per label in ``possible_sentiments``
        added to every record (0 for labels the model did not return).
    """
    for record in input_dict:
        # Default every label to 0 first, then fill in returned scores.
        # The original's if/else overwrote already-set labels with 0 when
        # the pipeline returned more than one result dict.
        for shade in possible_sentiments:
            record[shade] = 0
        for result in sentiment_analysis(record[variable_text]):
            if result['label'] in possible_sentiments:
                record[result['label']] = result['score']
    return input_dict
# Score the cleaned text of every tweet batch; get_sentiments mutates and
# returns each list, so the rebinds are equivalent to the originals.
results_apple, results_facebook, results_amazon = (
    get_sentiments(batch, 'clean_text')
    for batch in (results_apple, results_facebook, results_amazon)
)
import os
import re
import tweepy
from tweepy import OAuthHandler
import numpy as np
import pandas as pd
# text treatement
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#Wordcloud
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
# Graphs
import plotly.io as pio
pio.renderers.default='browser'
import plotly.express as px
# transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForTokenClassification
# Shell command (run in a terminal, not in Python):
# pip install -r requirements.txt
# Import Packages
from newscatcherapi import NewsCatcherApiClient
import time
# Initialize NewsCatcher API
# NOTE(review): replace the placeholder with a real key before running;
# prefer reading it from an environment variable, as done for Twitter below.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR-X-API-KEY')
# Extract News: 10 pages x 100 articles per company over one week.
apple_articles = []
facebook_articles = []
amazon_articles = []
# (destination list, search query) per company, in the original call order.
searches = [
    (apple_articles, '(Apple AND company) OR "Apple Inc"'),
    (facebook_articles, '(Facebook AND company) OR "Facebook Inc"'),
    (amazon_articles, '(Amazon AND company) OR "Amazon Inc"'),
]
for page in range(1, 11):
    for bucket, query in searches:
        response = newscatcherapi.get_search(q=query,
                                             lang='en',
                                             from_='2021-10-25',
                                             to_='2021-10-31',
                                             page_size=100,
                                             page=page)
        bucket.extend(response['articles'])
        # Pause between requests to respect the API rate limit.
        time.sleep(1)
# Twitter credentials come from the environment — never hard-code them.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ['CONSUMER_KEY'],
    os.environ['CONSUMER_SECRET'],
    os.environ['ACCESS_TOKEN'],
    os.environ['ACCESS_SECRET'],
)
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep through rate-limit windows
# instead of raising.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# Score article titles and keep only the three sentiment columns.
sentiment_cols = ['negative', 'neutral', 'positive']
apple_articles_pd = pd.DataFrame(get_sentiments(apple_articles, 'title')).loc[:, sentiment_cols]
facebook_articles_pd = pd.DataFrame(get_sentiments(facebook_articles, 'title')).loc[:, sentiment_cols]
amazon_articles_pd = pd.DataFrame(get_sentiments(amazon_articles, 'title')).loc[:, sentiment_cols]
# Mean score per label per company, one row per company.
total_score_articles = (
    pd.concat([apple_articles_pd.mean(),
               facebook_articles_pd.mean(),
               amazon_articles_pd.mean()], axis=1)
    .transpose()
    .reset_index()
)
total_score_articles.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_articles['Company'] = ['Apple', 'Facebook', 'Amazon']
# Sentiment Score
total_score_articles
# Graph: grouped bars of mean sentiment per company (news articles).
fig = px.histogram(
    total_score_articles,
    x='Company',
    y=['negative', 'neutral', 'positive'],
    title='Sentiment Score by Company | News Articles',
    barmode='group',
    color_discrete_sequence=["red", "blue", "green"],
)
fig.update_xaxes(title='Companies')
fig.update_yaxes(title='Sentiment score')
fig.show()
# Create DataFrames holding only the sentiment columns of each tweet batch.
score_cols = ['negative', 'neutral', 'positive']
apple_tweets_pd = pd.DataFrame(results_apple).loc[:, score_cols]
facebook_tweets_pd = pd.DataFrame(results_facebook).loc[:, score_cols]
amazon_tweets_pd = pd.DataFrame(results_amazon).loc[:, score_cols]
# Average each company's scores and stack into one row-per-company table.
total_score_tweets = (
    pd.concat([apple_tweets_pd.mean(),
               facebook_tweets_pd.mean(),
               amazon_tweets_pd.mean()], axis=1)
    .transpose()
    .reset_index()
)
total_score_tweets.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_tweets['Company'] = ['Apple', 'Facebook', 'Amazon']
total_score_tweets
# Visualize: grouped bars of mean sentiment per company (tweets).
fig = px.histogram(
    total_score_tweets,
    x='Company',
    y=['negative', 'neutral', 'positive'],
    title='Sentiment Score by Company | Tweets',
    barmode='group',
    color_discrete_sequence=["red", "blue", "green"],
)
fig.update_xaxes(title='Companies')
fig.update_yaxes(title='Sentiment score')
fig.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment