Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dodysw/9149929 to your computer and use it in GitHub Desktop.
"""
Pull a table of Indonesia province (Propinsi), city/regency name and type (Kotamadya/Kabupaten), village name (Kelurahan/Desa), and their zip code into csv file
Dody Suria Wijaya 2014 dody@cryptolab.net
"""
import urllib2, re
row_per_page = 5000 # if set too high, the remote site can run out of RAM
max_rows = 100000 # to make sure this script eventually stop, last time I check, maximum rows are less than 79k.
output_filename = "out.csv"
def download(url):
    """Fetch *url* and return the raw response body as a byte string.

    Sends a 'Google' User-agent header (the default urllib2 agent is
    presumably blocked by the site -- unverified).  The response is
    explicitly closed so the underlying socket is not leaked across the
    many paginated requests this script makes.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Google')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Fix: the original never closed the response, leaking a socket
        # per request.
        response.close()
# Regex for one data row: a <tr bgcolor="#ccffff"> containing exactly seven
# <td> cells.  Compiled once (it is applied to every page) and written as a
# raw string so \s is a genuine regex escape -- the original used "\s" in a
# plain string, which only worked via lenient handling of unknown escapes
# and is a DeprecationWarning on modern Pythons.
ROW_PATTERN = re.compile(
    r'<tr bgcolor="#ccffff">' + r'<td[^>]*>(.*?)</td>\s*' * 7 + r'</tr>')

def parse(buff):
    """Extract data rows from one paginated result page.

    buff -- raw HTML of the page (a string).

    Returns a list of 7-element lists, one per record, with any nested HTML
    tags stripped from each cell via detagdirty().
    """
    rows = ROW_PATTERN.findall(buff)
    return [[detagdirty(cell) for cell in row] for row in rows]
def detagdirty(line):
    """Return *line* with every HTML/XML tag removed, keeping inner text.

    Non-greedy so each '<...>' pair is matched as tightly as possible;
    empty tags like '<>' are stripped as well.
    """
    tag = re.compile(r'<[^>]*?>')
    return tag.sub('', line)
def output(data):
    """Dump all collected rows into output_filename as CSV.

    data -- iterable of row sequences, handed verbatim to csv.writerows.
    The file is opened in binary mode, as the Python 2 csv module expects.
    """
    import csv
    sink = open(output_filename, 'wb')
    try:
        csv.writer(sink).writerows(data)
    finally:
        sink.close()
def main():
    """Crawl all paginated result pages and write the combined CSV.

    Requests pages of row_per_page records each and stops when a page
    comes back empty, or after enough requests to cover max_rows.
    """
    # no1/no2 look like the 1-based start/end record numbers of the page
    # window and kk a page counter; the very first request sends all zeros,
    # which the site apparently treats as "from the start" -- TODO confirm
    # against the site's paging behavior.
    no1 = no2 = kk = 0
    results = []
    # Python 2 integer division; +1 so a partial last page is still fetched.
    request_maxcount = max_rows / row_per_page + 1
    for i in xrange(request_maxcount):
        url = "http://kodepos.nomor.net/_kodepos.php?_i=desa-kodepos&perhal=%d&urut=&no1=%d&no2=%d&kk=%d" % (row_per_page, no1, no2, kk)
        print url
        content = download(url)
        rows = parse(content)
        print "Got", len(rows), "records"
        if len(rows) == 0:
            # An empty page means we ran past the last record.
            break
        results += rows
        # Advance the paging window: kk is incremented first, then the
        # record window is recomputed from it for the NEXT request.
        kk += 1
        no1 = row_per_page * (kk-1) + 1
        no2 = no1 + row_per_page
    output(results)
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment