Skip to content

Instantly share code, notes, and snippets.

@cwchentw
Created October 14, 2018 23:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwchentw/34a7d3f97ab69716f16bb545d589dafd to your computer and use it in GitHub Desktop.
Save cwchentw/34a7d3f97ab69716f16bb545d589dafd to your computer and use it in GitHub Desktop.
Yahoo Finance Crawler as a Python script
#!/usr/bin/env python
##############################################################################
# fetchStockData.py - a friendly Yahoo Finance crawler. #
# #
# Requirements #
# #
# - Python 3 #
# - Selenium package for Python #
# - The web driver for Chrome #
# #
# 2018, Michael Chen; Apache 2.0 #
##############################################################################
import os
import sys
import time
from random import randint

from selenium import webdriver
from selenium.webdriver.common.by import By
# Parameters
timeSpan = "5Y"
downloadPath = os.path.dirname(os.path.abspath(__file__))

# Seconds to wait after requesting a page before touching its elements.
PAGE_LOAD_DELAY = 10
# Seconds to wait after clicking the download link before closing the browser.
DOWNLOAD_DELAY = 5


def _idle():
    """Sleep for 1 to 3 seconds to simulate a user idling between actions."""
    time.sleep(randint(1, 3))


def _find_all(driver, selector):
    """Return the list of elements matching the CSS *selector* (may be empty)."""
    return driver.find_elements(By.CSS_SELECTOR, selector)


def _click_by_text(driver, selector, text):
    """Click the first element matching *selector* whose text equals *text*.

    Returns True when an element was clicked and False when none matched,
    so the caller decides whether a miss is fatal.
    """
    for element in _find_all(driver, selector):
        if element.text == text:
            element.click()
            return True
    return False


def _download_history(driver, asset):
    """Drive the browser through Yahoo Finance and request the CSV of *asset*.

    Raises RuntimeError when a page element that the remaining steps depend
    on cannot be found.  Selenium's own exceptions are left to propagate.
    """
    # Go to the Yahoo Finance page and wait for it to refresh.
    driver.get("https://finance.yahoo.com/")
    time.sleep(PAGE_LOAD_DELAY)

    # Find the input box, type in the search and submit the form.
    search_box = driver.find_element(By.CSS_SELECTOR, "#fin-srch-assist input")
    search_box.send_keys(asset)
    search_box.submit()
    time.sleep(PAGE_LOAD_DELAY)

    # Select the subpage of Historical Data.
    if not _click_by_text(driver, "a span", "Historical Data"):
        raise RuntimeError("Cannot find the Historical Data tab")
    _idle()

    # Help Yahoo to earn some money.  The ad is not needed by any later
    # step, so it is clicked only when it is actually on the page.
    ads = _find_all(driver, "#sb_rel_defaultdestLDRB")
    if ads:
        ads[0].click()
    _idle()

    # Open the date picker dialog.
    arrows = _find_all(driver, ".historical div div span svg")
    if not arrows:
        raise RuntimeError("Cannot find the date range selector")
    arrows[0].click()
    _idle()

    # Select the time span, confirm it with Done, then apply the change.
    # A miss here is reported but not fatal: the run continues with
    # whatever range the page currently shows.
    picker = "[data-test=\"date-picker-menu\"]"
    steps = (
        (picker + " div span", timeSpan),
        (picker + " div button", "Done"),
        ("button span", "Apply"),
    )
    for selector, label in steps:
        if not _click_by_text(driver, selector, label):
            sys.stderr.write("Warning: cannot find \"%s\" on the page\n" % label)
        _idle()

    # Remove the old file, if any, before downloading.
    csvpath = os.path.join(downloadPath, "%s.csv" % asset)
    try:
        os.remove(csvpath)
    except FileNotFoundError:
        pass

    # Download the data.
    if not _click_by_text(driver, "a span", "Download Data"):
        raise RuntimeError("Cannot find the Download Data link")
    time.sleep(DOWNLOAD_DELAY)


def main(argv=None):
    """Run the crawler and return the process exit status.

    argv -- command-line vector; defaults to sys.argv.  argv[1] is the
            asset to search for on Yahoo Finance.

    Returns 0 when the download link was clicked, 1 when the asset argument
    is missing or blank or when a required page element was not found.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2 or not argv[1].strip():
        sys.stderr.write("No valid target asset\n")
        return 1
    targetAsset = argv[1]

    # Change default downloading path.
    options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': downloadPath}
    options.add_experimental_option('prefs', prefs)

    # Create a new instance of the Chrome driver.
    driver = webdriver.Chrome(options=options)
    try:
        _download_history(driver, targetAsset)
    except RuntimeError as err:
        sys.stderr.write("%s\n" % err)
        return 1
    finally:
        # Close the browser even when a step above failed.
        driver.quit()
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment