@XimingCheng
Created December 15, 2014 01:56
A Python script that scrapes administrative-division (location) data from the National Bureau of Statistics of the People's Republic of China
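The script walks the bureau's 2013 administrative-division pages level by level (province → city → county/district → town/street) and prints GBK-encoded SQL INSERT statements to stdout. A hypothetical invocation (the file name is assumed; it is not part of the gist):

python2 get_location.py > location.sql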
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import time
import sys
import socket
def getProvice(base_url):
    # Scrape the province index; return (name, 2-digit code, relative link) tuples.
    try:
        # time.sleep(5)  # optional: throttle requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # Retry once on a failed or timed-out request.
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    pro_nodes = soup.findAll(attrs = {"class" : "provincetr"})
    pro_data = []
    for pro_list in pro_nodes:
        for pro in pro_list.contents:
            proname = pro.a.contents[0]
            prolink = pro.a['href']
            code = prolink[0 : -5]  # strip the trailing ".html" to get the 2-digit province code
            pro_data.append((proname, code, prolink))
    return pro_data
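# Illustrative getProvice() result (shape inferred from the code above; values
# follow the GB/T 2260 province codes, e.g. 11 is Beijing, 12 is Tianjin):
#   [(u'北京市', u'11', u'11.html'), (u'天津市', u'12', u'12.html'), ...]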
def getCity(base_url):
    # Scrape a province page; return (name, 4-digit code, relative link) tuples.
    try:
        # time.sleep(5)  # optional: throttle requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # Retry once on a failed or timed-out request.
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    city_nodes = soup.findAll(attrs = {"class" : "citytr"})
    city_data = []
    for city in city_nodes:
        fullcode = city.a.contents[0]
        code = fullcode[0 : 4]  # first 4 digits: prefecture-level city code
        cityname = city.contents[1].a.contents[0]
        link = city.a['href']
        city_data.append((cityname, code, link))
    return city_data
def getDistrict(base_url):
    # Scrape a city page; return (name, 6-digit code, link or None, is_district) tuples.
    try:
        # time.sleep(5)  # optional: throttle requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # Retry once on a failed or timed-out request.
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    district_nodes = soup.findAll(attrs = {"class" : "countytr"})
    district_data = []
    if len(district_nodes) > 0:
        for district in district_nodes:
            if district.a:
                fullcode = district.a.contents[0]
                code = fullcode[0 : 6]  # first 6 digits: county-level code
                districtname = district.contents[1].a.contents[0]
                link = district.a['href']
                district_data.append((districtname, code, link, True))
            else:
                # Rows without a link (e.g. 市辖区) have no deeper page.
                fullcode = district.contents[0].contents[0]
                code = fullcode[0 : 6]
                districtname = district.contents[1].contents[0]
                district_data.append((districtname, code, None, True))
    else:
        # Some prefecture-level cities have no county level at all; their pages
        # list towns (towntr) directly, so fall back to the street scraper.
        data_street = getStreet(base_url)
        for street in data_street:
            # street[1] is already the 9-digit town code
            district_data.append((street[0], street[1], None, False))
    return district_data
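# Illustrative getDistrict() results (shapes follow the two branches above;
# the town-row values are placeholders, not fetched data):
#   (u'东城区', u'110101', u'01/110101.html', True)  # linked county-level row
#   (u'某街道', u'441900001', None, False)           # town row from a city with no county level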
def getStreet(base_url):
    # Scrape a district page; return (name, 9-digit code) tuples.
    sys.stderr.write(base_url + "\n")  # progress trace
    try:
        # time.sleep(5)  # optional: throttle requests
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    except (urllib2.URLError, socket.timeout):
        # Retry once on a failed or timed-out request.
        sys.stderr.write("request failed, retrying: " + base_url + "\n")
        request = urllib2.Request(base_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14')
        page = urllib2.urlopen(request, timeout = 100)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    street_nodes = soup.findAll(attrs = {"class" : "towntr"})
    street_data = []
    for street in street_nodes:
        fullcode = street.a.contents[0]
        code = fullcode[0 : 9]  # first 9 digits: town/street code
        streetname = street.contents[1].a.contents[0]
        street_data.append((streetname, code))
    return street_data
def genSQL(table, name, code, count, parent, level, timestamp):
    # Emit one GBK-encoded INSERT statement for a division node.
    count = str(count)
    # Maps the 2-digit province code to what appears to be the pinyin
    # initial of the province name; NULL for codes outside the map.
    head_map = {"11" : "B", "12" : "T", "13" : "H", "14" : "S",
                "15" : "N", "21" : "L", "22" : "J", "23" : "H", "31" : "S",
                "32" : "J", "33" : "Z", "34" : "A", "35" : "F", "36" : "J",
                "37" : "S", "41" : "H", "42" : "H", "43" : "H", "44" : "G",
                "45" : "G", "46" : "H", "50" : "C", "51" : "S", "52" : "G",
                "53" : "Y", "54" : "X", "61" : "S", "62" : "G", "63" : "Q",
                "64" : "L", "65" : "X", "71" : "T", "81" : "X", "82" : "A"}
    head = head_map.get(code, "NULL")
    sql = "INSERT INTO `" + table + "` VALUES (" + count + ",1, " + code
    sql += ", " + level + ", '" + name + "', " + parent + ", " + timestamp + ", "
    if head == "NULL":
        sql += "NULL);"
    else:
        sql += "'" + head + "');"
    print sql.encode('gbk')
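# Illustrative output of genSQL(table, u'北京市', u'11', 1, '0', '1', '1418608560000'),
# traced through the string concatenation above (the timestamp is assumed):
#   INSERT INTO `TB_IP_LocationCode` VALUES (1,1, 11, 1, '北京市', 0, 1418608560000, 'B');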
def main():
    pro_base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
    table = 'TB_IP_LocationCode'
    count = 0
    timec = str(int(1000 * time.time()))  # millisecond timestamp shared by all rows
    pro_data = getProvice(pro_base_url)
    pro_data = pro_data[35 : 40]  # process one batch at a time; adjust the slice to resume
    for pro in pro_data:
        count += 1
        genSQL(table, pro[0], pro[1], count, "0", "1", timec)
        city_data = getCity(pro_base_url + pro[2])
        for city in city_data:
            count += 1
            genSQL(table, city[0], city[1], count, pro[1], "2", timec)
            district_data = getDistrict(pro_base_url + city[2])
            for district in district_data:
                if district[3]:  # a real county-level division
                    count += 1
                    genSQL(table, district[0], district[1], count, city[1], "3", timec)
                    district_link = district[2]
                    if district_link:
                        street_data = getStreet(pro_base_url + pro[1] + '/' + district_link)
                        for street in street_data:
                            count += 1
                            genSQL(table, street[0], street[1], count, district[1], "4", timec)
                else:
                    # City without a county level: insert a placeholder
                    # district ("区") and hang the town node under it.
                    new_node = district[1][0 : 6]
                    sys.stderr.write(new_node + "\n")
                    count += 1
                    genSQL(table, u"区", new_node, count, city[1], "3", timec)
                    count += 1
                    genSQL(table, district[0], district[1], count, new_node, "4", timec)

if __name__ == "__main__":
    main()