Created
April 26, 2019 04:35
-
-
Save cwchentw/792809e5c7d6ab53bc11e6892df0be52 to your computer and use it in GitHub Desktop.
TAIEX web crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
############################################################################## | |
# fetchTAIEX.py # | |
# # | |
# Requirements # | |
# # | |
# - Python 3 # | |
# - Selenium package for Python # | |
# - The web driver for Chrome # | |
# # | |
# 2018, Michael Chen; Apache 2.0 # | |
############################################################################## | |
import csv
import datetime
import os
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Durations accepted as the first command-line argument.
validDurations = ['YTD', '1Y', '3Y', '5Y', '10Y', 'Max']

# Fall back to year-to-date when no argument is given.
duration = 'YTD'

if len(sys.argv) < 2:
    sys.stderr.write("No valid duration. Default to YTD.\n")
else:
    duration = sys.argv[1]

if duration not in validDurations:
    sys.stderr.write("Invalid duration: %s\n\n" % duration)
    sys.stderr.write("Valid durations:\n")
    for d in validDurations:
        sys.stderr.write("\t%s\n" % d)
    # Stop here: continuing would leave the start year unset and the
    # script would only fail later, after the browser has been launched.
    sys.exit(1)
# Work out the first (year, month) to fetch from the requested duration.
now = datetime.datetime.now()

# Number of years to step back for the rolling-window durations; these
# keep the current month as the starting month.
_yearsBack = {'1Y': 1, '3Y': 3, '5Y': 5, '10Y': 10}

if duration == 'YTD':
    # Year to date: start in January of the current year.
    year, month = now.year, 1
elif duration == 'Max':
    # 88 + 1911 == 1999; presumably ROC year 88, the earliest year the
    # site offers -- TODO confirm against the page's year list.
    year, month = 88 + 1911, 1
elif duration in _yearsBack:
    year, month = now.year - _yearsBack[duration], now.month
else:
    # Unrecognised duration: the year stays unset.
    year, month = None, now.month
# Create a new instance of the Chrome driver.
driver = webdriver.Chrome()

# Go to the TAIEX page.
driver.get("http://www.twse.com.tw/zh/page/trading/indices/MI_5MINS_HIST.html")

# Wait for the page to finish loading.
time.sleep(10)

# Locate the query button once; it is clicked again for every month fetched.
# find_element(By.CSS_SELECTOR, ...) is used because the older
# find_element_by_css_selector helper was removed in Selenium 4.3.
queryBtn = driver.find_element(By.CSS_SELECTOR, ".main a")

# Wait for the page to settle before interacting with it.
time.sleep(3)
def monToStr(m):
    """Return the integer month *m* as a zero-padded two-digit string.

    For example, 3 becomes '03' and 12 stays '12'.
    """
    return "%02d" % m
sys.stderr.write("Fetch data from the website...\n")

# YYYYMM stamps for the first and last month of the range; they are used
# to build the name of the output file.
pastDateStr = "%d%s" % (year, monToStr(month))
currDateStr = "%d%s" % (now.year, monToStr(now.month))

# Scraped table rows, one list of cell texts per row.
data = list()

# Loop state: the (year, month) currently being fetched and the stop flag.
currYear, currMonth = year, month
isEnd = False
def _selectOption(selectName, value):
    """Click the <option> of <select name=selectName> whose value is str(value).

    Sleeps 2 seconds after the click so the page can react. Returns True
    when a matching option was found and clicked; otherwise writes a
    warning to stderr and returns False.
    """
    selector = 'select[name="%s"] option' % selectName
    # find_elements(By.CSS_SELECTOR, ...) replaces the
    # find_elements_by_css_selector helper removed in Selenium 4.3.
    for option in driver.find_elements(By.CSS_SELECTOR, selector):
        if option.get_attribute("value") == str(value):
            option.click()
            time.sleep(2)
            return True
    sys.stderr.write(
        "Warning: no option '%s' in select '%s'\n" % (value, selectName))
    return False


def _fetchRows():
    """Click the query button and return the report table rows.

    Each row is returned as a list with the text of every <td> cell.
    """
    queryBtn.click()
    # Wait for the table to be refreshed with the queried month.
    time.sleep(3)
    rows = driver.find_elements(
        By.CSS_SELECTOR, "#report-table_wrapper tbody tr")
    return [
        [td.text for td in row.find_elements(By.CSS_SELECTOR, "td")]
        for row in rows
    ]


# Select the initial year.
_selectOption("yy", currYear)

# Walk month by month from (currYear, currMonth) up to the current month.
while not isEnd:
    # Past years run through December; the current year stops at this month.
    lastMonth = 12 if currYear < now.year else now.month

    if currMonth <= lastMonth:
        # Only query when the month could actually be selected.
        if _selectOption("mm", currMonth):
            data.extend(_fetchRows())
        currMonth += 1
    elif currYear < now.year:
        # Roll over to January of the next year and update the year list.
        currMonth = 1
        currYear += 1
        _selectOption("yy", currYear)
    else:
        isEnd = True
sys.stderr.write("Write data to csv file...\n")

# The output file name carries the first and last YYYYMM stamps of the range.
csvName = "TAIEX_%sto%s.csv" % (pastDateStr, currDateStr)

with open(csvName, 'w', newline='') as out:
    writer = csv.writer(out)
    # Header row first, then every scraped row in the order it was collected.
    writer.writerow(["Date", "Open", "High", "Low", "Close"])
    writer.writerows(data)

# Close the browser.
driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment