Skip to content

Instantly share code, notes, and snippets.

@melvyn-sopacua
Last active September 26, 2017 12:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save melvyn-sopacua/73f44e64bf8899490dafd299d62b2bea to your computer and use it in GitHub Desktop.
Save melvyn-sopacua/73f44e64bf8899490dafd299d62b2bea to your computer and use it in GitHub Desktop.
Get all cities in the Netherlands using the official dataset and merge them into a single GML file.
import argparse
import os
import sys
import warnings
import requests
from lxml import etree
# Upper bound on the number of features requested per call; also the default
# value of the --count option.
MAX_COUNT = 1000
# Default for --max-results. Hardcoded so the script does not have to parse
# the numberMatched attribute out of a response.
MAX_RESULTS = 2501

# Building blocks for the names of the per-batch files and the merged file.
FILENAME_PREFIX = 'woonplaatsen'
FILENAME_EXT = 'gml'
DEFAULT_OUTFILE = '.'.join((FILENAME_PREFIX, FILENAME_EXT))
SEPARATOR = '-'

# WFS endpoint and the query parameters shared by every request.
BASE_URL = 'https://geodata.nationaalgeoregister.nl/bag/wfs'
DEFAULT_PARAMS = {
    'request': 'GetFeature',
    'service': 'wfs',
    'typeName': 'bag:woonplaats',
    'sortBy': 'bag:identificatie',
}

# Names of the batch files, filled in by download() and read by merge().
FILES = []
def download(opts):
    """Fetch the woonplaats features in batches, one file per batch.

    Every batch is requested from BASE_URL with DEFAULT_PARAMS plus a
    ``startIndex``/``count`` pair and stored in the current directory as
    ``<prefix>-c<count>-i<startIndex>.<ext>``. A batch whose file already
    exists is not fetched again unless ``opts.overwrite`` is set. The name
    of every batch file that is present on disk afterwards is appended to
    the module-level FILES list.

    :param opts: parsed options; reads ``count``, ``max_results``,
        ``overwrite`` and ``quiet``.
    :raises ValueError: if the effective batch size is smaller than 1.
    """
    count = min(MAX_COUNT, opts.count)
    if count < 1:
        # A step of zero or less would never reach max_results.
        raise ValueError(
            'count must be at least 1, got {!r}'.format(opts.count)
        )

    with requests.session() as s:
        for si in range(0, opts.max_results, count):
            fname = '{pfx:s}{sep:s}c{cnt:04d}{sep:s}i{si:04d}.{ext:s}'.format(
                pfx=FILENAME_PREFIX, sep=SEPARATOR, cnt=count, si=si,
                ext=FILENAME_EXT,
            )
            if opts.overwrite or not os.path.exists(fname):
                params = DEFAULT_PARAMS.copy()
                params.update(startIndex=si, count=count)
                r = s.get(url=BASE_URL, params=params)
                if r.ok:
                    # Store the body exactly as received: decoding it and
                    # writing it back in text mode would re-encode it with
                    # the platform's default encoding.
                    with open(fname, 'wb') as f:
                        f.write(r.content)
                    if not opts.quiet:
                        print('--> {:s}'.format(fname))
                else:
                    if not opts.quiet:
                        print('ERROR: {:d}: {:s}'.format(r.status_code, r.url),
                              file=sys.stderr)
                    if not os.path.exists(fname):
                        # Nothing on disk for this batch, so do not register
                        # a file that a later merge could not open.
                        continue
            elif not opts.quiet:
                print('File exists: {:s}'.format(fname))
            FILES.append(fname)
def merge(opts):
    """Merge the batch files listed in FILES into a single GML document.

    The first file in FILES supplies the document and its root element; the
    ``wfs:member`` children of the root of every other file are appended to
    that root. The root's ``numberReturned`` attribute is then set to the
    resulting number of members, the paging attributes ``previous`` and
    ``next`` are removed, and the document is written pretty-printed to
    ``opts.outfile``.

    :param opts: parsed options; reads ``outfile`` and ``quiet``.
    :raises IndexError: if FILES is empty, i.e. there is nothing to merge.
    """
    if not FILES:
        raise IndexError('no downloaded files to merge')

    member_xpath_coll = './{http://www.opengis.net/wfs/2.0}member'
    # Slice instead of pop() so the shared FILES list is left untouched.
    first, others = FILES[0], FILES[1:]
    doc = etree.parse(first)  # type: etree.ElementTree
    root = doc.getroot()  # type: etree.ElementBase
    for fname in others:
        d = etree.parse(fname)
        root.extend(d.getroot().findall(member_xpath_coll))

    # Count only the member elements; the root may carry other children.
    member_count = len(root.findall(member_xpath_coll))
    root.set('numberReturned', '{:d}'.format(member_count))

    # Shouldn't be in there, but just in case
    if 'previous' in root.attrib:
        if not opts.quiet:
            warnings.warn(
                'Previous member in first file pointing to: {:s}'.format(
                    # The attribute lives on the root element; the tree
                    # object itself has no get().
                    root.get('previous')
                )
            )
        del root.attrib['previous']
    # This one is expected to be present on the first batch.
    if 'next' in root.attrib:
        del root.attrib['next']

    if opts.outfile == '-' and not opts.quiet:
        warnings.warn('Outfile set to -, but writing to stdout not supported')
        warnings.warn('This will create a file called "-"')
    with open(opts.outfile, 'wb') as f:
        f.write(etree.tostring(doc, pretty_print=True))
    if not opts.quiet:
        print('Merged to {:s}'.format(opts.outfile))
if __name__ == '__main__':
    # Command-line entry point: parse the options, download the batches and,
    # unless merging was disabled, combine them into the output file.
    cli = argparse.ArgumentParser(
        description='Download and merge BAG woonplaatsen into one GML file.',
    )
    cli.add_argument(
        '-c', '--count',
        type=int,
        default=MAX_COUNT,
        metavar='batch_count',
        help='Fetch at max this many per request.',
    )
    cli.add_argument(
        '-F', '--overwrite',
        action='store_true',
        default=False,
        help='overwrite existing (previously downloaded) files.',
    )
    cli.add_argument(
        '-m', '--max-results',
        dest='max_results',
        type=int,
        default=MAX_RESULTS,
        metavar='result_count',
        help='Use this for maximum results.',
    )
    cli.add_argument(
        '-M', '--no-merge',
        dest='dontmerge',
        action='store_true',
        default=False,
        help='Only download files, do not merge',
    )
    cli.add_argument(
        '-o', '--outfile',
        default=DEFAULT_OUTFILE,
        metavar='file',
        help='Output file to write the merged file to.',
    )
    cli.add_argument(
        '-q', '--quiet',
        dest='quiet',
        action='store_true',
        default=False,
        help='provide no output',
    )
    options = cli.parse_args(sys.argv[1:])

    download(options)
    if options.dontmerge:
        # An explicit output file has no effect when nothing is merged.
        if not options.quiet and options.outfile != DEFAULT_OUTFILE:
            warnings.warn('outfile argument provided but merge disabled')
    else:
        merge(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment