Skip to content

Instantly share code, notes, and snippets.

@ayaka14732
Created February 23, 2021 06:13
Show Gist options
  • Save ayaka14732/570aa2faeb9adea8a2c9effb0aef1c7f to your computer and use it in GitHub Desktop.
Save ayaka14732/570aa2faeb9adea8a2c9effb0aef1c7f to your computer and use it in GitHub Desktop.
爬取古音小鏡上某個方言點的字音資料
'''
爬取古音小鏡上某個方言點的字音資料。
設計思路:
1. 從「古音小鏡 > 漢語地理 > 音系速覽 > 某一方言點 > 聲調-聲母配合表」中獲取所有「聲調-聲母配合關係」
2. 對於每個「聲調-聲母配合關係」,獲取相關的所有字及字音
'''
from bs4 import BeautifulSoup
import logging
import requests
# Root-logger setup: show INFO and above, each record prefixed with a
# minute-resolution timestamp and the level name.
_LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)
def parse_outer(f, url):
    '''Crawl a dialect point's overview page and dump every character table it links to.

    Downloads ``url``, takes the element matched by ``table:nth-of-type(3)``,
    verifies that its caption is the tone-initial combination table
    ("聲調-聲母配合表"), and calls ``parse_inner`` once for every non-empty
    link found inside the table's cells.

    Args:
        f: Writable text file object; handed unchanged to ``parse_inner``.
        url: Address of the overview page to crawl.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The expected table (or its caption) was not found.
    '''
    from urllib.parse import urljoin

    logging.info('爬取 %s...', url)
    response = requests.get(url)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.select_one('table:nth-of-type(3)')
    caption = table.select_one('caption') if table is not None else None
    # Explicit check rather than `assert`, which disappears under `python -O`.
    if caption is None or caption.text != '聲調-聲母配合表':
        raise ValueError('定位到聲調-聲母配合表')

    for a in table.select('td a:not(:empty)'):
        # Resolve the href against the page it came from, the way a browser
        # would, so relative, root-relative and absolute links all work.
        parse_inner(f, urljoin(response.url, a['href']))
def _iter_expanded_rows(table, width):
    '''Yield every selected row of ``table`` as ``width`` strings, repeating rowspan cells.'''
    remaining = [0] * width   # per column: rows still covered by an earlier rowspan
    current = [None] * width  # per column: text of the most recent cell seen
    # 'tr + tr' matches each <tr> that directly follows a sibling <tr>, so the
    # first row (presumably the header) is never yielded.
    for row in table.select('tr + tr'):
        # Only element children are cells; whitespace text nodes between the
        # tags carry no data and have no attributes.
        cells = iter(row.find_all(True, recursive=False))
        for col in range(width):
            if remaining[col] > 0:
                # Column is still covered by a cell from an earlier row:
                # keep its value and consume no cell from this row.
                remaining[col] -= 1
                continue
            cell = next(cells, None)
            if cell is None:
                raise ValueError('字表有六列')
            current[col] = cell.text
            rowspan = cell.get('rowspan')
            if rowspan is not None:
                remaining[col] = int(rowspan) - 1
        yield list(current)


def parse_inner(f, url):
    '''Crawl one character table and write its rows to ``f``.

    Downloads ``url``, takes the first ``<table>`` of the page, checks that
    its first row has six cells, and then writes one tab-separated line per
    following row containing columns 1, 3, 4 and 5 of the table. Cells that
    span several rows (``rowspan``) are repeated on each row they cover.

    Args:
        f: Writable text file object receiving the tab-separated lines.
        url: Address of the character table page.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The table is missing, its first row does not have six
            cells, or a later row has too few cells.
    '''
    width = 6

    logging.info('爬取 %s...', url)
    response = requests.get(url)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.select_one('table')
    first_row = table.select_one('tr') if table is not None else None
    # Explicit check rather than `assert`, which disappears under `python -O`.
    if first_row is None or len(first_row.find_all(True, recursive=False)) != width:
        raise ValueError('字表有六列')

    # Names follow the original script: 字頭 = character, 語音 = pronunciation,
    # 聲調 = tone, 註語 = note. Columns 2 and 6 are not written.
    for 字頭, _, 語音, 聲調, 註語, _ in _iter_expanded_rows(table, width):
        print(字頭, 語音, 聲調, 註語, sep='\t', file=f)
# Script entry: crawl one dialect point and write the result to data.csv.
# The encoding is fixed to UTF-8 because the crawled pages contain Chinese
# text, which the platform's default encoding may be unable to represent.
with open('data.csv', 'w', encoding='utf-8') as f:
    # Change this URL to the dialect point you want to crawl.
    parse_outer(f, 'http://sino.kaom.net/si_box8.php?c=Z005')
    # To test against a single character table, call instead:
    # parse_inner(f, 'http://sino.kaom.net/si_box88.php?c=2%E2%98%86Z005%E2%98%86p%E2%98%8621')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment