Skip to content

Instantly share code, notes, and snippets.

@xNihil0
Last active October 19, 2020 13:55
Show Gist options
  • Save xNihil0/2f9673fa6b7f33f37cc5ea1cb728d7db to your computer and use it in GitHub Desktop.
Save xNihil0/2f9673fa6b7f33f37cc5ea1cb728d7db to your computer and use it in GitHub Desktop.
行政区划代码爬虫
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import lxml
import json
def parse_page(url):
    """Download a page and build a code -> name mapping from its first table.

    The page at ``url`` is fetched and parsed with the ``lxml`` parser.
    Every ``<tr>`` of the first ``<table>`` in the document is examined:
    the text of its second ``<td>`` is converted to an ``int`` code and the
    stripped text of its third ``<td>`` is used as the name.

    Args:
        url: Address of the HTML page to download.

    Returns:
        A dict mapping each integer code to its name.  Rows that have fewer
        than three cells, or whose second cell is not an integer, are
        skipped.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no ``<table>`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to scrape an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    names_by_code = {}
    for row in soup.find_all("table")[0].find_all("tr"):
        cells = row.find_all("td")
        try:
            code = int(cells[1].text)
            name = cells[2].text.strip()
        except (IndexError, ValueError):
            # Not a data row (too few cells, or a non-numeric code cell,
            # e.g. a header row): skipping it is the intended best effort.
            continue
        names_by_code[code] = name
    return names_by_code
if __name__ == "__main__":
    # Script entry point: scrape the code table, then print one JSON record
    # for every code whose last two digits are non-zero.
    names = parse_page(
        "http://www.mca.gov.cn/article/sj/xzqh/2020/2020/2020092500801.html"
    )

    records = []
    for code in names:
        # Codes ending in "00" get no record of their own; they are only
        # looked up below as the parents of the remaining codes.
        if code % 100 == 0:
            continue

        # Parent codes are derived by zeroing the trailing digits.
        province_code = code // 10000 * 10000
        city_code = code // 100 * 100

        full_name = names[province_code]
        if city_code in names:
            full_name += names[city_code]
        else:
            # The table has no row for the derived city code, so the entry
            # itself stands in as its own city-level code.
            city_code = code
        # NOTE(review): indentation was lost in SOURCE; the entry's own name
        # is taken to be appended for every record -- confirm.
        full_name += names[code]

        records.append(
            {
                "代码": code,
                "名称": full_name,
                "省级代码": province_code,
                "市级代码": city_code,
                "县级代码": code,
            }
        )

    # ensure_ascii=False keeps the Chinese text readable in the output.
    print(json.dumps(records, ensure_ascii=False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment