Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Fetch all county-level administrative divisions of PRC
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import io
import json
# Start URL for the crawl. It is an empty string in this copy, so the
# urlopen() call below has nothing valid to fetch.
# NOTE(review): the URL appears to have been stripped from this copy (the
# trailing "# noqa" suggests a long literal used to be here) -- restore it
# from the original script.
homepage = "" # noqa
# Fetch the start page and parse it with Python's built-in HTML parser.
soup = BeautifulSoup(urlopen(homepage), 'html.parser')
# Result mapping; the script later stores one `admin2` dict per `admin1_name`
# in it and dumps the whole mapping as JSON.
divisions = {}
# Build a mapping {admin2_name: [admin3 names]} from one navbox row.
#
# NOTE(review): this copy of the function is damaged -- all indentation is
# gone and several expressions are missing (flagged line by line below), so
# it does not parse as-is. Restore it from the original script; the notes
# here describe only what the surviving text shows.
#
# Parameter:
#   tr -- presumably a BeautifulSoup <tr> tag, judging by the name and by the
#         `tr` variables in the crawl loop; confirm against the original.
# Returns:
#   `admin2`, a dict. As far as the flattened layout shows, it receives the
#   single key `admin2_name` only when `admin3` is non-empty and is returned
#   empty otherwise.
def generate_admin2_dict(tr):
admin2 = {}
# Commented-out earlier form of the name lookup (already incomplete here):
# admin2_name = if is not None else "(直轄縣級行政區)"
# Default group name; the text means "(directly administered county-level
# divisions)".
admin2_name = "(直轄縣級行政區)"
# NOTE(review): the expression tested against None is missing on this line.
if is not None:
# NOTE(review): both the value and the tested expression of this conditional
# are missing, as is the `else` operand.
admin2_name = if is not None else
# Generic category labels (autonomous county / county / municipal district)
# are wrapped in parentheses, matching the style of the default name above.
if admin2_name in ['自治縣', '縣', '市轄區']:
admin2_name = '(' + admin2_name + ')'
admin3 = []
# NOTE(review): damaged loop header -- the iterable is missing. Presumably it
# iterated over the <a> tags inside the row; confirm.
for a in'a'):
# Text node that directly follows the link, or None when there is none.
postfix = a.findNextSibling(text=True)
# Only links whose following text is absent or does not start with '*' are
# considered here.
if postfix is None or postfix[0] != '*':
admin3_name = a.get_text()
# The placeholder "no county-level divisions" is wrapped in parentheses too.
if admin3_name == '無縣級行政區':
admin3_name = '(無縣級行政區)'
# NOTE(review): no surviving statement adds `admin3_name` to `admin3`; an
# append was presumably lost. As written, `admin3` would always stay empty.
if len(admin3) != 0:
admin2[admin2_name] = admin3
return admin2
# Main crawl: walk every link on the start page, open the linked template
# page, read its navbox, and record the result in `divisions`.
#
# NOTE(review): indentation was lost in this copy and some statements are
# missing (flagged below), so the nesting of the lines that follow cannot be
# determined from the text alone. Restore it from the original script.
for link in soup.find_all('a'):
link_text = link.get_text()
# Filter on the link text: true when it does not start with 'Template:' or
# when the character right after that prefix is '中'.
# NOTE(review): which statements this `if` guards is not recoverable here (a
# `continue` may have been lost) -- confirm. Also, text that is exactly
# 'Template:' would make `link_text[9]` raise IndexError.
if link_text[:9] != 'Template:' or link_text[9] == '中':
# Resolve the href against the start URL and switch the path from /wiki/ to
# /zh-tw/.
page2 = urljoin(homepage, link.get('href')).replace("/wiki/", "/zh-tw/")
soup2 = BeautifulSoup(urlopen(page2), 'html.parser')
# First-level name: `title` attribute of the third link inside the first
# navbox title header cell.
admin1_name = soup2.find_all('th', {'class': 'navbox-title'})[0].find_all('a')[2].get('title')
admin2 = {}
# Category of the navbox: `title` attribute of the link in the first
# navbox group header cell.
cate = soup2.find_all('th', {'class': 'navbox-group'})[0].a.get('title')
print(admin1_name + " : " + cate)
# '地級行政區' = prefecture-level divisions: walk the rows of every navbox
# sub-group table.
if cate == '地級行政區':
for table in soup2.find_all('table', {'class': 'navbox-subgroup'}):
for tr in table.find_all("tr"):
# NOTE(review): the tested expression and the body of this `if` are missing.
# Presumably the row was passed to generate_admin2_dict() and merged into
# `admin2`; confirm.
if is None:
# '縣級行政區' = county-level divisions: use the parent row of every navbox
# group header cell except the first.
elif cate == '縣級行政區':
for th in soup2.find_all('th', {'class': 'navbox-group'})[1:]:
tr = th.parent
# NOTE(review): same damage as above -- condition operand and body missing.
if is None:
divisions[admin1_name] = admin2
# NOTE(review): the call that opens the output file is missing on the next
# line (presumably `io.open(` or `open(`; confirm). The surviving arguments
# show the file 'data.text' opened for writing with UTF-8 encoding.
with'data.text', 'w', encoding='utf8') as outfile:
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
json.dump(divisions, outfile, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment