Created
May 13, 2020 15:45
-
-
Save s0g00d/264ec9244230fc312666bc2f4fa4f102 to your computer and use it in GitHub Desktop.
Stock Headline Sentiment Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gspread | |
from oauth2client.service_account import ServiceAccountCredentials | |
from os import path | |
from bs4 import BeautifulSoup as bs | |
import requests | |
import pandas as pd | |
from pandas import DataFrame | |
from urllib.request import Request, urlopen | |
import re # import Regular expression library | |
# Folder that holds the service-account key file used below.
DATA_DIR = 'C:/Users/xbsqu/Desktop/Python Learning/Projects/Premarket Stock Price'

# --- Connecting to G Sheet ---
# Scopes passed to the service-account credentials.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
keyfile_path = path.join(DATA_DIR, 'client_secret.json')
creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)
client = gspread.authorize(creds)

# Open the workbook by its title and pick the worksheet at index 2.
sheet = client.open('Stock Watcher')
worksheet = sheet.get_worksheet(2)  # Will need to update this to the live sheet
# Now connected to the G Sheet.
# Download the Finviz quote page for a single ticker.
# We need to send a User-Agent header so as to not get 403 errors.
stock_url = 'https://finviz.com/quote.ashx?t=RCL'
req = Request(stock_url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read (the bare urlopen(...).read()
# form left it open until garbage collection).
with urlopen(req) as response:
    webpage = response.read()
# Make the soup call & scrape the news table out of the downloaded page.
page_soup = bs(webpage, 'html.parser')
headline_table = page_soup.find('table', {'class': 'fullview-news-outer'})
if headline_table is None:
    # find() returns None when nothing matches; stop here with a clear
    # message instead of an AttributeError on NoneType further down.
    raise ValueError(
        "No table with class 'fullview-news-outer' found in the downloaded page"
    )

# Remove the publishing blog name: every <span> inside the table is dropped
# so that only the remaining cell text is parsed below.
for span_tag in headline_table.find_all('span'):
    span_tag.decompose()

# Turn the Soup data into a df and add headers.
# read_html returns a list of frames; the first one is the table we passed in.
headline_table = pd.read_html(str(headline_table))[0]
headline_table.columns = ['Date', 'Title']
# Now we need to fix the date column.
# Cells that start with a capital letter are cut down to their first nine
# characters; every other cell is blanked out.
# NOTE(review): presumably cells with a leading capital letter carry a full
# date and the rest only a time -- confirm against the page markup.
date_column = headline_table['Date']
short_date = date_column.str.slice(stop=9)

# Vectorized test replacing the per-row re.search; na=False treats missing
# cells as "no match" instead of raising on a non-string value.
starts_with_capital = date_column.str.match(r'^[A-Z]', na=False)

# Write the result back to the frame. The previous loop never changed
# anything: Series.replace() returns a new object that was thrown away, and
# `date == ""` was a comparison rather than an assignment.
headline_table['Date'] = short_date.where(starts_with_capital, "")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment