Last active
August 28, 2023 05:08
-
-
Save cwchentw/50b71fa9b7aab702af0003ec6002cc6e to your computer and use it in GitHub Desktop.
Taiwan Foreclosure Data Crawler as a Python script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
##############################################################################
# fetchForeclosureData.py                                                    #
#                                                                            #
# Requirements                                                               #
#                                                                            #
# - Python 3                                                                 #
# - Selenium package for Python                                              #
# - The web driver for Chrome                                                #
#                                                                            #
# Copyright (c) 2018-2020, Michael Chen; Apache 2.0                          #
##############################################################################
import csv
import datetime
import os
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Query parameters.
targetCounty = "新北市"    # County to query (New Taipei City).
targetDistrict = "板橋區"  # District to query (Banqiao District).

now = datetime.datetime.now()
# The start-year menu is matched against the Gregorian year minus 1911
# (presumably the Republic of China calendar year shown by the site);
# the query starts three years before the current year.
threeYearAgo = str(now.year - 1911 - 3)

# Chrome is told to save downloads in the directory holding this script.
downloadPath = os.path.dirname(os.path.abspath(__file__))

# Page to crawl and the ``name`` attributes of the form controls used below.
queryUrl = "https://pip.moi.gov.tw/V2/A/SCRA0601.aspx"
countyMenuName = "ctl00$ContentPlaceHolder1$ddlCity"
districtMenuName = "ctl00$ContentPlaceHolder1$ddlTown"
startYearMenuName = "ctl00$ContentPlaceHolder1$ddlYear1"


def _findMenu(driver, menuName):
    """Return the <select> element whose ``name`` attribute is *menuName*.

    Uses ``find_element(By.CSS_SELECTOR, ...)`` because the older
    ``find_element_by_css_selector`` shortcut no longer exists in
    Selenium 4.3 and later.
    """
    return driver.find_element(
        By.CSS_SELECTOR, 'select[name="{}"]'.format(menuName))


def _clickOptionContaining(driver, menuName, wanted):
    """Click the first option of menu *menuName* whose text contains *wanted*.

    Args:
        driver: The Selenium WebDriver showing the query page.
        menuName: Value of the <select> element's ``name`` attribute.
        wanted: Substring looked for in each option's visible text.

    Returns:
        True if a matching option was clicked, False if none matched.
    """
    selector = 'select[name="{}"] option'.format(menuName)
    for option in driver.find_elements(By.CSS_SELECTOR, selector):
        if wanted in option.text:
            option.click()
            return True
    # Carry on with whatever the page currently has selected, but make the
    # mismatch visible instead of silently downloading unrelated data.
    print(
        "Warning: no option containing {!r} in menu {!r}".format(
            wanted, menuName),
        file=sys.stderr)
    return False


# Change Chrome options: (1) default downloading path, (2) disable popup
# blocking.
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": downloadPath,
    "profile.default_content_settings": {"popups": 1},
}
options.add_experimental_option('prefs', prefs)

# Create a new instance of the Chrome driver.
driver = webdriver.Chrome(options=options)

# Everything after the browser starts runs under try/finally so that Chrome
# is shut down even when a step raises (missing element, network error, ...).
try:
    # Go to the foreclosure house page.
    driver.get(queryUrl)
    time.sleep(6)  # Wait for the page to refresh.

    # Trick to handle site redirecting: request the same page a second time.
    driver.get(queryUrl)
    time.sleep(6)  # Wait for the page to refresh.

    # Open the county menu and select the target county.
    countyMenu = _findMenu(driver, countyMenuName)
    countyMenu.click()
    time.sleep(2)  # Wait for the page to refresh.
    _clickOptionContaining(driver, countyMenuName, targetCounty)
    time.sleep(4)  # Wait for the page to refresh.

    # Open the district menu and select the target district.
    districtMenu = _findMenu(driver, districtMenuName)
    districtMenu.click()
    time.sleep(2)  # Wait for the page to refresh.
    _clickOptionContaining(driver, districtMenuName, targetDistrict)
    time.sleep(2)  # Wait for the page to refresh.

    # Click the district menu again to toggle it back.
    districtMenu.click()
    time.sleep(2)  # Wait for the page to refresh.

    # Open the starting year menu and select the starting year.
    startYearMenu = _findMenu(driver, startYearMenuName)
    startYearMenu.click()
    time.sleep(2)  # Wait for the page to refresh.
    _clickOptionContaining(driver, startYearMenuName, threeYearAgo)
    time.sleep(2)  # Wait for the page to refresh.

    # Close the starting year menu.
    startYearMenu.click()
    time.sleep(2)  # Wait for the page to refresh.

    # Submit our query.
    submitButton = driver.find_element(
        By.CSS_SELECTOR, "[name=\"ctl00$ContentPlaceHolder1$btnQuery\"]")
    submitButton.click()
    time.sleep(4)  # Wait for the page to refresh.

    # Remove the old Excel file before the new download starts.
    # NOTE(review): the "%5b"/"%5d" in the name are kept literally; this is
    # presumably the file name Chrome writes for this download - confirm.
    excelPath = os.path.join(downloadPath, "%5b拍賣拍定價%5d查詢結果下載.xls")
    try:
        os.remove(excelPath)
    except FileNotFoundError:
        pass  # Nothing left over from a previous run.

    # Download the new Excel file.
    link = driver.find_element(
        By.CSS_SELECTOR, "#ctl00_ContentPlaceHolder1_btnExportExcel")
    link.click()
    time.sleep(5)  # Give the download time to finish before quitting.
finally:
    # Close the browser.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment