Last active
January 18, 2023 00:07
-
-
Save dray89/46a982956d9667474e2cfcedf07406a0 to your computer and use it in GitHub Desktop.
Python Scrape Yahoo Stock Market Statistics Webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from bs4 import BeautifulSoup | |
import requests, lxml | |
from lxml import html | |
class statistics:
    '''
    Scraper for the Yahoo Finance "key-statistics" page of a single stock symbol.

    Typical use: construct with a symbol, call scrape_page() to download and parse
    the page's HTML tables, then pass that result to label_stats() to label the
    tables and store them as attributes on the instance.
    '''
    base_url = "https://finance.yahoo.com/"

    # Attribute names assigned by label_stats, in the order the scraped tables
    # are assigned to them.
    _table_attributes = ('valuation', 'fiscal_year', 'profitability', 'manager_effect',
                         'income_statement', 'balance_sheet', 'cash_statement',
                         'price_history', 'share_stats', 'dividendSplit')

    def __init__(self, symbol):
        '''
        :param symbol: stock symbol (converted to upper case before use)
        please note that any Canadian TSX stocks are followed with ".TO" ... check the relevant URL for formatting.
        '''
        self.symbol = symbol.upper()
        # Build the path from the normalised symbol so the requested URL always
        # agrees with self.symbol, even when the caller passed lower case.
        self.path = "quote/{0}/key-statistics?p={0}".format(self.symbol)
        self.url = self.base_url + self.path
        self.methods = ['scrape_page', 'label_stats']
        self.attributes = ['self.symbol', 'self.path', 'self.url', 'self.methods', 'self.hdrs',
                           'self.valuation', 'self.fiscal_year',
                           'self.profitability', 'self.manager_effect',
                           'self.income_statement', 'self.balance_sheet', 'self.cash_statement',
                           'self.price_history', 'self.share_stats', 'self.dividendSplit']
        # Request headers sent with every page fetch in scrape_page.
        self.hdrs = {"authority": "finance.yahoo.com",
                     "method": "GET",
                     "path": self.path,
                     "scheme": "https",
                     "accept": "text/html,application/xml;q=0.9",
                     "accept-encoding": "gzip, deflate, br",
                     "accept-language": "en-US,en;q=0.9",
                     "referer": self.base_url,
                     "sec-fetch-mode": "navigate",
                     "sec-fetch-site": "same-origin",
                     "sec-fetch-user": "?1",
                     "upgrade-insecure-requests": "1",
                     "user-agent": "Mozilla/5.0 (Windows NT 10.0;)"}

    def scrape_page(self):
        '''
        Download self.url using the headers defined in the init function and
        parse every <table> element found in the response.

        :return: a list with one entry per <table> element; each entry is the
        list of dataframes that pandas.read_html produced for that table.
        :raises requests.HTTPError: if the server answers with an error status.
        '''
        # Local import keeps this class self-contained with respect to the
        # module's existing import block.
        from io import StringIO

        # A timeout prevents the call from blocking forever on a stalled connection.
        page = requests.get(self.url, headers=self.hdrs, timeout=30)
        # Surface HTTP failures here instead of silently returning no tables.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')
        # Wrap the markup in StringIO: recent pandas versions deprecate passing
        # literal HTML strings to read_html.
        return [pd.read_html(StringIO(str(table))) for table in soup.find_all('table')]

    def label_stats(self, table_list):
        '''
        :param table_list: uses the output of the scrape_page method
        :return: list of labelled dataframes; each one is also stored as an
        attribute of the statistics class object (valuation, fiscal_year, ...).
        Uses the __indexLabel__ method to label columns and set the dataframes' index.
        :raises ValueError: if table_list does not hold exactly one entry per
        expected attribute.
        '''
        expected = len(self._table_attributes)
        if len(table_list) != expected:
            # Fail with an actionable message rather than an opaque unpacking error.
            raise ValueError(
                "expected {0} tables but got {1}; the page layout may have changed "
                "or the request did not return the statistics page".format(
                    expected, len(table_list)))
        # Each entry is a list of dataframes; only the first one is used.
        labelled = [self.__indexLabel__(tables[0]) for tables in table_list]
        for name, frame in zip(self._table_attributes, labelled):
            setattr(self, name, frame)
        return labelled

    def __indexLabel__(self, df):
        '''
        :param df: Takes a two-column dataframe as input.
        :return: returns a new dataframe with columns labelled 'Measure' and
        'Value', indexed by 'Measure'. The input dataframe is left unchanged.
        '''
        # Work on a copy so the caller's dataframe keeps its original columns.
        labelled = df.copy()
        labelled.columns = ['Measure', 'Value']
        return labelled.set_index('Measure')
if __name__ == "__main__":
    # Example run: fetch the key-statistics tables for SHOP, then label them
    # and attach them to the statistics object.
    shopify_stats = statistics('SHOP')
    table_list = shopify_stats.label_stats(shopify_stats.scrape_page())
The pandas module has probably been updated since I wrote this. I’d suggest creating a virtual environment with pandas installed at a prior version back in 2018-2019 or updating the code to be compatible with the new version.
…Sent from my iPad
On Apr 12, 2022, at 12:38 PM, zhijiaodaniellegoh ***@***.***> wrote:
@zhijiaodaniellegoh commented on this gist.
I got the error "module 'pandas' has no attribute 'read_html'". Does anyone know how to solve it?
—
Reply to this email directly, view it on GitHub, or unsubscribe.
You are receiving this because you authored the thread.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I ran your code and I got this error:
I printed the table_list and it appears to be empty. I think this is the problem, so I can't get the summary values of the stock. How can I solve this? Thanks in advance.