Skip to content

Instantly share code, notes, and snippets.

@mikeyee
Last active December 31, 2021 15:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mikeyee/8d3a1f0931b0fdda9ec77aacd76c6998 to your computer and use it in GitHub Desktop.
Save mikeyee/8d3a1f0931b0fdda9ec77aacd76c6998 to your computer and use it in GitHub Desktop.
到港交所港股通網頁,下載港股通每日持股量數據
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time
import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Search page for the daily shareholding figures.
targetpage = "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=hk"

# Resume from the existing CSV when there is one; otherwise start an empty
# table that holds only the two key columns and write it out straight away.
try:
    table = pd.read_csv('northwater.csv')
except OSError:  # IOError is an alias of OSError on Python 3
    table = pd.DataFrame(columns=['股票編號', '公司名稱'])
    table.to_csv('northwater.csv', index=False)
def gendate(start_date, end_date):
    """Return every day from start_date to end_date as 'YYYYMMDD' strings.

    Args:
        start_date: First day of the range (a ``datetime.date``), inclusive.
        end_date: Last day of the range (a ``datetime.date``), inclusive.

    Returns:
        A list of ``%Y%m%d``-formatted strings in ascending order. The list
        is empty when end_date is earlier than start_date.
    """
    span_days = (end_date - start_date).days
    # +1 so that end_date itself is part of the result.
    return [
        f'{start_date + datetime.timedelta(days=offset):%Y%m%d}'
        for offset in range(span_days + 1)
    ]
def checknewdate(date, alldate):
    """Tell whether a date still has to be loaded.

    Args:
        date: The date to look up.
        alldate: Collection of the dates that are already loaded.

    Returns:
        True when date is not contained in alldate, False otherwise.
    """
    return date not in alldate
def loadpage(targetpage, date):
    """Open the search page, query one shareholding date and return the HTML.

    Uses the module-level Selenium ``driver``.

    Args:
        targetpage: URL of the search page to open.
        date: The date to query, as a 'YYYYMMDD' string.

    Returns:
        The driver's page source after the search button has been clicked.

    Raises:
        AssertionError: If the page title does not contain "HKEX", or the
            page source contains "No results found." after the search.
    """
    # The date field is filled in as 'YYYY/MM/DD'.
    inputdate = f"{date[:4]}/{date[4:6]}/{date[6:]}"

    driver.get(targetpage)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if "HKEX" not in driver.title:
        raise AssertionError("unexpected page title: " + repr(driver.title))

    # The date is handed over as a script argument rather than being spliced
    # into the JavaScript source text.
    driver.execute_script(
        "document.getElementById('txtShareholdingDate')"
        ".setAttribute('value', arguments[0])",
        inputdate,
    )

    # find_element_by_name() was removed in Selenium 4; find_element(By.NAME, ...)
    # is the supported spelling.
    driver.find_element(By.NAME, "btnSearch").click()
    if "No results found." in driver.page_source:
        raise AssertionError("No results found for " + inputdate)

    # NOTE: implicitly_wait() does not pause here; it only sets how long the
    # driver's later element lookups keep retrying.
    driver.implicitly_wait(10)
    return driver.page_source
def readpage(page, date):
    """Turn one result page into a DataFrame of code, name and holding.

    Args:
        page: HTML source of the result page.
        date: Label used as the name of the shareholding column.

    Returns:
        A DataFrame with the columns '股票編號' (int), '公司名稱' (str) and
        one column named after date (int, thousands separators removed).
    """
    soup = BeautifulSoup(page, 'html.parser')
    code_cells = soup.find_all('td', class_='col-stock-code')
    name_cells = soup.find_all('td', class_='col-stock-name')
    holding_cells = soup.find_all('td', class_='col-shareholding')

    def cell_text(cell):
        # The value is read from the second <div> inside the cell.
        return cell.select('div')[1].get_text().strip()

    codes = []
    names = []
    holdings = []
    # The stock-code cells drive the row count; the other two lists are
    # indexed in step with them.
    for row, code_cell in enumerate(code_cells):
        codes.append(int(cell_text(code_cell)))
        names.append(cell_text(name_cells[row]))
        holdings.append(int(cell_text(holding_cells[row]).replace(',', '')))

    return pd.DataFrame({
        "股票編號": codes,
        "公司名稱": names,
        date: holdings,
    })
def appenddata(table,temptable):
    """Outer-merge temptable into table on stock code and company name.

    Args:
        table: The accumulated DataFrame.
        temptable: The DataFrame to merge in.

    Returns:
        A new DataFrame holding the union of rows from both inputs, matched
        on the '股票編號' and '公司名稱' columns.
    """
    key_columns = ['股票編號', '公司名稱']
    return table.merge(temptable, how='outer', on=key_columns)
# Date range to download (both ends inclusive).
start_date = datetime.date(2018, 9, 1)
end_date = datetime.date(2018, 9, 29)
dates = gendate(start_date, end_date)

# Dates already present in the table: every column after the two key columns.
# This must be set before the loop; the first checknewdate() call reads it.
alldate = list(table)[2:]

driver = webdriver.Firefox()
try:
    for day in dates:
        if not checknewdate(day, alldate):
            continue
        page = loadpage(targetpage, day)
        temptable = readpage(page, day)
        table = appenddata(table, temptable)
        alldate = list(table)[2:]
finally:
    # Runs on failure too, so the browser is always shut down and the dates
    # merged so far are not lost when a later page fails to load.
    driver.quit()
    table.to_csv('northwater.csv', index=False)
@mikeyee
Copy link
Author

mikeyee commented Nov 29, 2018

Updated the program to work with the revamped HKEX web page.

@dom-chen
Copy link

bug??
line 101, in <module>
if checknewdate(dates[i],alldate):

NameError: name 'alldate' is not defined

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment