# dodysw/build_indonesia_province_regency_district_csv.py

## build_indonesia_province_regency_district_csv.py
"""
Scrape a table of Indonesian provinces (Propinsi), cities/regencies with their type (Kotamadya/Kabupaten), villages (Kelurahan/Desa), and their postal codes from kodepos.nomor.net into a CSV file.
Dody Suria Wijaya 2014 dody@cryptolab.net
"""

import urllib2, re

# Rows requested per page; if set too high, the remote site can run out of RAM.
row_per_page = 5000
# Upper bound on the rows to request, so the script is guaranteed to stop
# eventually. When last checked, the site held fewer than 79k rows.
max_rows = 100000
# Path of the CSV file the scraped rows are written to.
output_filename = "out.csv"

def download(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a ``User-agent`` header of ``Google``.
    Errors raised by ``urllib2`` while opening or reading the URL are not
    caught here and propagate to the caller.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Google')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Close the connection explicitly rather than leaving it open until
        # the response object happens to be garbage-collected.
        response.close()

# One data row of the result table: a <tr bgcolor="#ccffff"> holding exactly
# seven <td> cells. Raw strings keep ``\s`` a regex escape instead of an
# invalid string escape, and compiling once avoids rebuilding the pattern on
# every call.
_ROW_PATTERN = re.compile(
    r'<tr bgcolor="#ccffff">' + r'<td[^>]*>(.*?)</td>\s*' * 7 + r'</tr>'
)
# Anything that looks like an HTML tag.
_TAG_PATTERN = re.compile(r'<[^>]*?>')


def parse(buff):
    """Extract the data rows from the HTML of one result page.

    Args:
        buff: HTML source to scan.

    Returns:
        A list with one entry per matched row. Each entry is a list of the
        row's seven cell texts with nested markup stripped. The list is empty
        when no row matches.
    """
    return [
        [detagdirty(cell) for cell in row]
        for row in _ROW_PATTERN.findall(buff)
    ]


def detagdirty(line):
    """Return *line* with every ``<...>`` tag removed; other text is kept."""
    return _TAG_PATTERN.sub('', line)

def output(data):
    """Write *data*, an iterable of rows, to ``output_filename`` as CSV.

    The file is opened for writing, so any previous content is replaced.
    """
    import csv
    # The Python 2 csv module expects the file to be opened in binary mode.
    with open(output_filename, 'wb') as handle:
        csv.writer(handle).writerows(data)

def main():
    no1 = no2 = kk = 0
    results = []
    request_maxcount = max_rows / row_per_page + 1

    for i in xrange(request_maxcount):
        url = "http://kodepos.nomor.net/_kodepos.php?_i=desa-kodepos&perhal=%d&urut=&no1=%d&no2=%d&kk=%d" % (row_per_page, no1, no2, kk)
        print url
        content = download(url)
        rows = parse(content)
        print "Got", len(rows), "records"
        if len(rows) == 0:
            break

        results += rows

        kk += 1
        no1 = row_per_page * (kk-1) + 1
        no2 = no1 + row_per_page

    output(results)

# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
	"""
	Scrape a table of Indonesian provinces (Propinsi), cities/regencies with their type (Kotamadya/Kabupaten), villages (Kelurahan/Desa), and their postal codes from kodepos.nomor.net into a CSV file.
	Dody Suria Wijaya 2014 dody@cryptolab.net
	"""

	import urllib2, re

	# Rows requested per page; if set too high, the remote site can run out of RAM.
	row_per_page = 5000
	# Upper bound on the rows to request, so the script is guaranteed to stop
	# eventually. When last checked, the site held fewer than 79k rows.
	max_rows = 100000
	# Path of the CSV file the scraped rows are written to.
	output_filename = "out.csv"

	def download(url):
		"""Fetch *url* and return the raw response body.

		The request is sent with a ``User-agent`` header of ``Google``.
		Errors raised by ``urllib2`` while opening or reading the URL are not
		caught here and propagate to the caller.
		"""
		opener = urllib2.build_opener()
		opener.addheaders = [('User-agent', 'Google')]
		response = opener.open(url)
		try:
			return response.read()
		finally:
			# Close the connection explicitly rather than leaving it open
			# until the response object happens to be garbage-collected.
			response.close()

	# One data row of the result table: a <tr bgcolor="#ccffff"> holding
	# exactly seven <td> cells. Raw strings keep ``\s`` a regex escape instead
	# of an invalid string escape.
	_ROW_PATTERN = re.compile(
		r'<tr bgcolor="#ccffff">' + r'<td[^>]*>(.*?)</td>\s*' * 7 + r'</tr>'
	)
	# Anything that looks like an HTML tag.
	_TAG_PATTERN = re.compile(r'<[^>]*?>')

	def parse(buff):
		"""Extract the data rows from the HTML of one result page.

		Args:
			buff: HTML source to scan.

		Returns:
			A list with one entry per matched row. Each entry is a list of
			the row's seven cell texts with nested markup stripped. The list
			is empty when no row matches.
		"""
		return [
			[detagdirty(cell) for cell in row]
			for row in _ROW_PATTERN.findall(buff)
		]

	def detagdirty(line):
		"""Return *line* with every ``<...>`` tag removed; other text is kept."""
		return _TAG_PATTERN.sub('', line)

	def output(data):
		"""Write *data*, an iterable of rows, to ``output_filename`` as CSV.

		The file is opened for writing, so any previous content is replaced.
		"""
		import csv
		# The Python 2 csv module expects the file to be opened in binary mode.
		with open(output_filename, 'wb') as f:
			writer = csv.writer(f)
			writer.writerows(data)

	def main():
	no1 = no2 = kk = 0
	results = []
	request_maxcount = max_rows / row_per_page + 1

	for i in xrange(request_maxcount):
	url = "http://kodepos.nomor.net/_kodepos.php?_i=desa-kodepos&perhal=%d&urut=&no1=%d&no2=%d&kk=%d" % (row_per_page, no1, no2, kk)
	print url
	content = download(url)
	rows = parse(content)
	print "Got", len(rows), "records"
	if len(rows) == 0:
	break

	results += rows

	kk += 1
	no1 = row_per_page * (kk-1) + 1
	no2 = no1 + row_per_page

	output(results)

	# Run the scraper only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
		main()