Skip to content

Instantly share code, notes, and snippets.

@arbakker
Last active March 11, 2022 10:49
Show Gist options
  • Save arbakker/e158d992558f23edd6512b382088987f to your computer and use it in GitHub Desktop.
Save arbakker/e158d992558f23edd6512b382088987f to your computer and use it in GitHub Desktop.
Bash script for scraping WFS services in a responsible way
#!/usr/bin/env bash
# Bash script for scraping WFS services in a responsible way. The script
# divides the area of interest into a list of bounding boxes, which are
# requested sequentially. Each HTTP GET request is configured to retry, using
# curl's default exponential backoff algorithm.
# Only use this when no bulk download service (such as ATOM) is available.
#
# Every setting below keeps its original value as the default and can be
# overridden through an environment variable of the same name, for example:
#   GRID_SIZE=5000 TARGET_GPKG=rail.gpkg ./script.sh
set -eu

# GeoPackage file that receives the downloaded features.
TARGET_GPKG="${TARGET_GPKG:-output.gpkg}"
# Base URL of the WFS service (without query string).
WFS_URL="${WFS_URL:-https://service.pdok.nl/prorail/spoorwegen/wfs/v1_0}"
# Feature type to download, written as "<prefix>:<name>".
FT_NAME="${FT_NAME:-spoorwegen:kilometrering}"
# Layer name used with ogrinfo/ogr2ogr: the second ":"-separated field of FT_NAME.
LAYER_NAME=$(cut -d":" -f2 <<<"$FT_NAME")
# Pause (passed to `sleep`) after each bounding box has been downloaded.
SLEEP="${SLEEP:-1}"

# Extent of the Netherlands (mainland): 6500,310000,290000,620000
# (RD coordinates, in metres).
NL_MINX="${NL_MINX:-6500}"
NL_MINY="${NL_MINY:-310000}"
NL_MAXX="${NL_MAXX:-290000}"
NL_MAXY="${NL_MAXY:-620000}"
# Edge length of one grid cell: 10 km, which results in 928 bounding boxes
# for the default extent.
GRID_SIZE="${GRID_SIZE:-10000}"
#######################################
# Download every feature of FT_NAME inside one bounding box, page by page,
# and write the pages to TARGET_GPKG (created on the first page, appended to
# afterwards).
# Globals:
#   WFS_URL, FT_NAME, LAYER_NAME, TARGET_GPKG (all read only)
# Arguments:
#   $1 - bounding box as "minx,miny,maxx,maxy"
# Outputs:
#   Logs every requested URL to stdout; curl -v diagnostics go to stderr.
# Returns:
#   0 once the service stops returning features, otherwise the exit status
#   of the failing mktemp, curl or ogr2ogr call.
#######################################
function get_features() {
  local bbox="$1"
  local page_size=1000
  local page=0
  local status=0
  local start_index url work_dir tmp_gml ft_count
  local -a ogr_args

  # Private scratch directory: avoids predictable file names in /tmp and lets
  # one rm -rf clean up both the GML pages and the .gfs sidecar files that OGR
  # writes next to every GML file it opens.
  work_dir=$(mktemp -d "${TMPDIR:-/tmp}/wfs-scrape.XXXXXX") || return

  # Loop until no more features are returned by the WFS (the WFS hit count is
  # not reliable AFAIK).
  while true; do
    start_index=$((page_size * page))
    url="${WFS_URL}?SERVICE=WFS&VERSION=2.0.0&bbox=${bbox}&REQUEST=GetFeature&TYPENAMES=${FT_NAME}&COUNT=${page_size}&STARTINDEX=${start_index}"
    # A fresh file name per page, so a .gfs left by an earlier page can never
    # be picked up for a later one.
    tmp_gml="${work_dir}/page-${page}.gml"
    echo "requesting ${url}"
    curl -A "pdok-wfs-download-script" -sv --retry-max-time 60 --retry 10 --retry-connrefused --connect-timeout 10 "$url" >"$tmp_gml" || {
      status=$?
      break
    }

    ft_count=$(ogrinfo "$tmp_gml" "$LAYER_NAME" -so | grep "Feature Count" | cut -d" " -f 3)
    # Anything that is not a positive integer (0, or empty because ogrinfo
    # reported no such layer) means there is nothing more to fetch.
    if [[ ! "$ft_count" =~ ^[1-9][0-9]*$ ]]; then
      break
    fi

    ogr_args=(-f GPKG)
    if [[ -f "$TARGET_GPKG" ]]; then
      ogr_args+=(-append)
    fi
    ogr2ogr "${ogr_args[@]}" "$TARGET_GPKG" "$tmp_gml" "$LAYER_NAME" || {
      status=$?
      break
    }

    # Drop the processed page right away to keep disk usage flat.
    rm -f -- "$tmp_gml" "${tmp_gml%.gml}.gfs"
    page=$((page + 1))
  done

  rm -rf -- "${work_dir:?}"
  return "$status"
}
# Walk the area of interest as a grid of GRID_SIZE x GRID_SIZE cells and
# download every cell that is not yet present in TARGET_GPKG, which makes an
# interrupted run resumable.
# The grid overshoots by one cell when (NL_MAXX - NL_MINX) / GRID_SIZE has no
# remainder, because seq then also emits NL_MAXX itself as a cell origin.
for minx in $(seq "$NL_MINX" "$GRID_SIZE" "$NL_MAXX"); do
  for miny in $(seq "$NL_MINY" "$GRID_SIZE" "$NL_MAXY"); do
    maxx=$(bc <<<"$minx+$GRID_SIZE")
    maxy=$(bc <<<"$miny+$GRID_SIZE")
    bbox="$minx,$miny,$maxx,$maxy"

    # Count the features already stored for this cell; stays 0 while the
    # GeoPackage does not exist yet.
    ft_count=0
    if [[ -f "$TARGET_GPKG" ]]; then
      ft_count=$(ogrinfo "$TARGET_GPKG" "$LAYER_NAME" -spat "$minx" "$miny" "$maxx" "$maxy" -so | grep "Feature Count" | cut -d" " -f 3)
    fi

    # Anything that is not a positive integer (0, or empty when ogrinfo could
    # not report a count) means the cell still has to be downloaded.
    if [[ ! "$ft_count" =~ ^[1-9][0-9]*$ ]]; then
      get_features "$bbox"
      sleep "$SLEEP" # be gentle with the service between cells
    else
      echo "FEATURE COUNT: ${ft_count}, BBOX: ${bbox}. SKIPPING WFS REQUEST."
    fi
  done
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment