Skip to content

Instantly share code, notes, and snippets.

@Zabanaa
Created December 3, 2015 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zabanaa/599168f0a0fc274f675e to your computer and use it in GitHub Desktop.
First stab at creating a web crawler using Requests and Beautiful Soup
# This script will attempt to fetch data from Yahoo finance for a particular
# stock, get the prices for the last 6 months and save the info to a csv file
import requests
from bs4 import BeautifulSoup
import csv
import time
def get_prices(url):
    """Fetch the stock-price table from a Yahoo Finance history page.

    Parameters
    ----------
    url : str
        URL of the Yahoo Finance historical-prices page to scrape.

    Returns
    -------
    list[list[str]]
        One ``[date, adjusted_close]`` pair per data row of the table.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    """
    print("Fetching Prices ...")
    time.sleep(1)  # courtesy delay between successive page requests

    # Fix: add a timeout so a stalled server cannot hang the crawler
    # forever, and fail loudly on HTTP errors instead of silently
    # trying to parse an error page.
    initial_request = requests.get(url, timeout=30)
    initial_request.raise_for_status()
    soup = BeautifulSoup(initial_request.text, "html.parser")

    # The page has several generic tables matching cellpadding="2"; the
    # CSS selectors on the site are weak, so we rely on table position:
    # index 4 is the one containing the price rows.
    tables = soup.find_all("table", {"cellpadding": "2"})
    stock_table = tables[4]

    # Skip the first row (headings) and the last row (no price data).
    rows = stock_table.find_all("tr")[1:-1]

    prices = []
    for row in rows:
        cells = row.find_all("td")
        date = cells[0].string        # first cell: trade date
        adj_price = cells[-1].string  # last cell: adjusted close price
        prices.append([date, adj_price])

    print("Prices stored, preparing the info ...")
    return prices
def copy_to_csv(price_list, file_name):
    """Append rows of price data to ``<file_name>.csv``.

    Parameters
    ----------
    price_list : list[list[str]]
        Rows (e.g. ``[date, adjusted_close]``) to append to the file.
    file_name : str
        Path of the target file, without the ``.csv`` extension.
    """
    print("Writing prices to csv file ...")
    # Fix: use a context manager so the file is closed even if a write
    # fails, and open with newline="" as the csv module requires (this
    # prevents blank lines between rows on Windows). The cosmetic
    # 1-second sleep served no purpose and was dropped.
    with open(file_name + ".csv", "a", newline="") as prices_file:
        csv.writer(prices_file).writerows(price_list)
    print("Prices saved to the csv file, open it !")
if __name__ == "__main__":
    # The price table on every page begins with a row of headings. If
    # get_prices wrote them, the headings would repeat once per fetched
    # page in the output CSV, so the header row is written exactly once
    # here, before crawling, and copy_to_csv only ever appends data rows.
    #
    # Fixes vs the original: newline="" for the csv module, no redundant
    # close() inside the `with` block (the context manager already closes
    # the file), and the fetched rows no longer reuse the file handle's
    # variable name.
    with open("apple-prices.csv", "w", newline="") as header_file:
        csv.writer(header_file).writerow(["Date", "Adj Close"])

    # Yahoo paginates the history 66 rows at a time; `y` is the row
    # offset, so we advance by 66 until the 198-row window is covered.
    page = 0
    while page < 198:
        apple_stock_url = "https://uk.finance.yahoo.com/q/hp?s=AAPL&a=03&b=6&c=2015&d=11&e=2&f=2015&g=d&z=66&y=" + str(page)
        print(apple_stock_url)
        page_prices = get_prices(apple_stock_url)
        copy_to_csv(page_prices, "apple-prices")
        page += 66
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment