Skip to content

Instantly share code, notes, and snippets.

@iigmir
Created June 1, 2024 08:13
Show Gist options
  • Save iigmir/cf15b7107c96664356ff91141d1ccaec to your computer and use it in GitHub Desktop.
Save iigmir/cf15b7107c96664356ff91141d1ccaec to your computer and use it in GitHub Desktop.
Fetch lemma pages
import os

import requests
# Number of entries requested per listing page (sent as the PageSize query value).
page_size = 50
# Presumably the total number of entries in the listing; fetch_page compares
# it against page * page_size to decide when to stop. TODO confirm the count.
total = 205
# Page number the download starts from.
page = 1
# With the values above, the first request URL should be:
# `https://www.taichung.gov.tw/10026/Lpsimplelist?Page=1&PageSize=50&type=`
def get_api(page=0, page_size=0, total=0) -> str:
    """Build the listing URL for one page of results.

    Args:
        page: Value placed in the ``Page`` query parameter.
        page_size: Value placed in the ``PageSize`` query parameter.
        total: Accepted so the signature matches ``fetch_page``; it is not
            part of the URL.

    Returns:
        The request URL, ending with an empty ``type`` parameter.
    """
    return (
        "https://www.taichung.gov.tw/10026/Lpsimplelist"
        f"?Page={page}&PageSize={page_size}&type="
    )
def get_file(page=0) -> str:
    """Return the file name used to store the given page, e.g. ``"3.html"``."""
    return f"{page}.html"
def get_folder() -> str:
    """Return the relative path of the folder downloaded pages are saved in."""
    return "./result"
def get_page(url, filename=""):
    """Download ``url`` and save the response body inside ``get_folder()``.

    Args:
        url: Address requested with HTTP GET.
        filename: Name of the file to create inside the result folder.

    Request failures (connection errors, timeouts, non-success statuses) are
    printed and swallowed so a single bad page does not abort a multi-page
    run. File-system errors are not caught and propagate to the caller.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return

    # open() does not create missing directories, so make sure the target
    # folder exists before the first write.
    os.makedirs(get_folder(), exist_ok=True)
    res_path = os.path.join(get_folder(), filename)
    with open(res_path, "wb") as file:
        file.write(response.content)
    print(f"{filename} has downloaded")
def fetch_page(page=0, page_size=0, total=0):
    """Download listing pages from ``page`` onward until ``total`` is covered.

    The starting page is always downloaded. Further pages are downloaded
    only while the pages fetched so far (``page * page_size`` entries) do
    not yet cover ``total``, so no extra page is requested when ``total`` is
    an exact multiple of ``page_size``.

    Args:
        page: First page number to download.
        page_size: Number of entries per page; must be positive.
        total: Number of entries that must be covered.

    Raises:
        ValueError: If ``page_size`` is not positive, because the stop
            condition could then never be reached.
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    get_page(get_api(page, page_size, total), get_file(page))
    # NOTE(review): this assumes page numbers are 1-based, as the example
    # URL in this file (Page=1) suggests; confirm against the site.
    while page * page_size < total:
        page += 1
        get_page(get_api(page, page_size, total), get_file(page))
    print("Done")
# Script entry point: download every listing page using the module-level settings.
fetch_page(page, page_size, total)
from bs4 import BeautifulSoup
import os
import json
def export_file(result=None, num=1):
    """Write ``result`` as pretty-printed JSON to ``./json/<num>.json``.

    Args:
        result: JSON-serialisable data to write; an empty list when omitted.
        num: Number used as the output file's base name.

    The ``./json`` folder is created if it does not exist yet.
    """
    # Avoid a shared mutable default; None stands for "no entries".
    if result is None:
        result = []

    folder = "./json"
    os.makedirs(folder, exist_ok=True)
    output_path = os.path.join(folder, f"{num}.json")

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    result_json = json.dumps(result, ensure_ascii=False, indent=4)
    with open(output_path, "wb") as file:
        # Encode explicitly so the bytes on disk never depend on the
        # platform's default text encoding.
        # https://stackoverflow.com/a/37376668/7162445
        file.write(result_json.encode("utf-8"))
def main(num=1):
    """Convert one saved listing page into a JSON file of term pairs.

    Reads ``./result/<num>.html``, takes the text of the second and third
    cell of every table row under ``.list``, and passes the collected
    ``{'zh': ..., 'en': ...}`` entries to ``export_file`` under the same
    number.

    Args:
        num: Number of the saved page to process.
    """
    src_file = "./result/" + str(num) + ".html"
    # Keep the file open only for the read; parsing works on the string.
    with open(src_file, 'r', encoding='utf-8') as file:
        html = file.read()

    soup = BeautifulSoup(html, 'html.parser')
    result = []
    for row in soup.select(".list table tr"):
        # Cells are picked by position. An earlier variant selected them by
        # attribute instead: td[data-title="中文詞彙"] and
        # td[data-title="英文詞彙"].
        zh_lemma_element = row.select_one('td:nth-child(2)')
        en_lemma_element = row.select_one('td:nth-child(3)')
        # Rows without both cells (presumably header rows) carry no entry.
        if zh_lemma_element is None or en_lemma_element is None:
            continue
        result.append({
            # Strip the whitespace surrounding the cell text.
            'zh': zh_lemma_element.text.strip(),
            'en': en_lemma_element.text.strip(),
        })
    export_file(result, num)
def resc(num, stop=6):
    """Run ``main`` for every page number from ``num`` up to ``stop``.

    Args:
        num: First page number to process.
        stop: Exclusive upper bound; the default of 6 processes pages
            up to and including 5.
    """
    # A plain loop replaces the former self-recursion used for counting.
    for current in range(num, stop):
        main(current)
# Script entry point: convert saved pages 1 through 5 (resc stops before 6).
resc(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment