Created
February 23, 2021 06:13
-
-
Save ayaka14732/570aa2faeb9adea8a2c9effb0aef1c7f to your computer and use it in GitHub Desktop.
爬取古音小鏡上某個方言點的字音資料
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
爬取古音小鏡上某個方言點的字音資料。
設計思路:
1. 從「古音小鏡 > 漢語地理 > 音系速覽 > 某一方言點 > 聲調-聲母配合表」中獲取所有「聲調-聲母配合關係」
2. 對於每個「聲調-聲母配合關係」,獲取相關的所有字及字音
'''
from bs4 import BeautifulSoup | |
import logging | |
import requests | |
# Root logger setup: report INFO and above, with each record prefixed by a
# minute-resolution timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M',
    level=logging.INFO,
)
def parse_outer(f, url):
    """Crawl a dialect point's overview page and dump every linked table.

    Downloads ``url``, selects the first ``<table>`` that is the third table
    among its siblings, verifies that its caption reads 聲調-聲母配合表, and
    then calls ``parse_inner`` once for every non-empty ``<a>`` found inside
    the table's ``<td>`` cells.

    Args:
        f: Writable text file object, passed through to ``parse_inner``.
        url: URL of the overview page of the dialect point to crawl.

    Raises:
        AssertionError: If the expected table (or its caption) is not found.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    logging.info('爬取 %s...', url)

    # Bound the wait so that a stalled connection cannot hang the crawl.
    response = requests.get(url, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.select_one('table:nth-of-type(3)')
    caption = table.select_one('caption') if table is not None else None
    # Raised explicitly rather than via `assert` so the check still runs under
    # `python -O`, and so a missing table gives this message instead of an
    # AttributeError on None.
    if caption is None or caption.text != '聲調-聲母配合表':
        raise AssertionError('定位到聲調-聲母配合表')

    for link in table.select('td a:not(:empty)'):
        # The hrefs are appended to the site root to form the detail page URL.
        parse_inner(f, 'http://sino.kaom.net/' + link['href'])
def parse_inner(f, url):
    """Crawl one character table and write its rows to ``f``.

    Downloads ``url``, takes the first ``<table>`` on the page, checks that
    its first row has six cells, and then walks every following row. Cells
    carrying a ``rowspan`` attribute are repeated for the rows they cover, so
    each logical row is rebuilt as six strings. Of those, the 1st, 3rd, 4th
    and 5th (字頭, 語音, 聲調, 註語) are written to ``f`` as one tab-separated
    line; the 2nd and 6th are discarded.

    Args:
        f: Writable text file object that receives the output lines.
        url: URL of the character table page.

    Raises:
        AssertionError: If no table is found or its first row does not have
            exactly six cells.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    logging.info('爬取 %s...', url)

    # Bound the wait so that a stalled connection cannot hang the crawl.
    response = requests.get(url, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.select_one('table')
    header = table.select_one('tr') if table is not None else None
    # Count element children only: whitespace text nodes between cells must
    # not be mistaken for columns. Raised explicitly rather than via `assert`
    # so the check still runs under `python -O`.
    if header is None or len(header.find_all(True, recursive=False)) != 6:
        raise AssertionError('字表有六列')

    rowspan_counter = [0] * 6  # rows still covered by a spanning cell, per column
    last_data = [None] * 6     # cell texts of the previous logical row

    for row in table.select('tr + tr'):  # every row except the first one
        cells = iter(row.find_all(True, recursive=False))
        data = []  # the six cell texts of this logical row
        for i in range(6):
            if rowspan_counter[i] > 0:
                # Column is still covered by a rowspan cell from an earlier
                # row: reuse its text, the current <tr> has no cell for it.
                data.append(last_data[i])
                rowspan_counter[i] -= 1
            else:
                cell = next(cells)
                data.append(cell.text)
                rowspan = cell.get('rowspan')
                if rowspan is not None:
                    # This row consumes one unit of the span itself.
                    rowspan_counter[i] = int(rowspan) - 1
        字頭, _, 語音, 聲調, 註語, _ = data  # pylint: disable=unbalanced-tuple-unpacking
        print(字頭, 語音, 聲調, 註語, sep='\t', file=f)
        last_data = data
# Write the output as UTF-8 explicitly: the scraped text is CJK/IPA, and the
# platform default encoding may be unable to represent it.
with open('data.csv', 'w', encoding='utf-8') as f:
    # Change this URL to the dialect point you want to crawl.
    parse_outer(f, 'http://sino.kaom.net/si_box8.php?c=Z005')
    # For testing a single character table:
    # parse_inner(f, 'http://sino.kaom.net/si_box88.php?c=2%E2%98%86Z005%E2%98%86p%E2%98%8621')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment