@andrewharvey
Created July 8, 2011 22:44
Keeps a local minute-replicate mirror of fosm.org osc files
#!/usr/bin/perl -w
# Inspect a local minute-replicate mirror and return the URL of the next diff file
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
use strict;
use warnings;
use Log::Log4perl qw(:easy);
Log::Log4perl->easy_init($DEBUG); #DEBUG, INFO, WARN, ERROR, FATAL
# minute-replicate/A/B/C;
my $dir = "minute-replicate";
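# Example (hypothetical layout): if the newest diff on disk is
# minute-replicate/100/123/456.osc.gz, this script prints
# minute-replicate/100/123/457.osc.gz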
# open the minute-replicate directory
opendir(my $dh, $dir) || die("You need to have the $dir directory in your current working directory.\n");
# find all the sub-directories and get the largest number one
my @files = grep { /^\d+$/ } readdir($dh); closedir $dh; # ignore . .. and any non-numeric entries
my @files_sorted = sort @files;
my $largest_A = pop @files_sorted;
INFO "Largest A found is: $largest_A\n";
opendir(my $Adh, "$dir/$largest_A") || die("Cannot open $dir/$largest_A: $!\n");
# find all the sub-directories and get the largest number one
my @files_in_A = grep { /^\d+$/ } readdir($Adh); closedir $Adh; # ignore . .. and any non-numeric entries
my @files_in_A_sorted = sort @files_in_A;
my $largest_B = pop @files_in_A_sorted;
INFO "Largest B found is: $largest_B\n";
# next inside this directory, find all sub-files and find the largest number one
opendir(my $Bdh, "$dir/$largest_A/$largest_B") || die("Cannot open $dir/$largest_A/$largest_B: $!\n");
my @files_in_B = grep(/\.osc\.gz$/, readdir($Bdh));
closedir $Bdh;
my @files_in_B_sorted = sort @files_in_B;
my $largest_C = pop @files_in_B_sorted;
$largest_C =~ /^(\d+)\.osc\.gz$/ or die("Unexpected file name: $largest_C\n");
$largest_C = $1;
INFO "Largest C found is: $largest_C\n";
# next determine the next expected file
my $next_A = $largest_A;
my $next_B;
my $next_C;
if ($largest_C == 999) {
    $next_C = 0;
    $next_B = $largest_B + 1;
} else {
    $next_C = $largest_C + 1;
    $next_B = $largest_B;
}
if ($next_B == 1000) {
    $next_A++;
    $next_B = 0;
}
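# Roll-over example (hypothetical): if the newest file were
# minute-replicate/100/123/999.osc.gz the next expected file is
# minute-replicate/100/124/000.osc.gz; likewise 100/999/999 rolls over to
# minute-replicate/101/000/000.osc.gz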
print "$dir/".sprintf("%03d", $next_A)."/".sprintf("%03d", $next_B)."/".sprintf("%03d", $next_C).".osc.gz\n";
#!/bin/sh
# Author: Andrew Harvey <andrew.harvey4@gmail.com>
# License: CC0 http://creativecommons.org/publicdomain/zero/1.0/
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# This script will replicate changesets pushed to fosm. The recommended way to
# invoke this script is via something like:
# while sleep 2h; do replicate-fosm-changesets.sh; done
# You can start this script from a blank state, but it may be more efficient to
# kick start your local copy by running something like (where you replace
# 1000002000 with a number close to the largest changeset ID in use):
# curl -o "head/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]"
# curl -o "body/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]/download"
# We will always get the changeset/id (head) document which gives the changeset
# tags. We can additionally grab the changeset contents, i.e. the
# changeset/id/download (body) document. This is controlled by the
# DOWNLOAD_BODY variable. If you already have the minutly diffs you probably
# don't NEED the body as you have that information in your osc diff files.
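# For reference (based on the OSM 0.6 API, which fosm mirrors; attribute and
# element details may vary): the head document is an <osm> wrapper around a
# single <changeset> element carrying the changeset tags, roughly
#   <osm version="0.6"><changeset id="1000000001" ...><tag k="comment" v="..."/></changeset></osm>
# and the body document is an osmChange file, roughly
#   <osmChange version="0.6"><create><node id="..." lat="..." lon="..."/>...</create>...</osmChange>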
DOWNLOAD_BODY=true
#DOWNLOAD_BODY=
# Where shall we save the data we download to?
SAVETO="/data/fosm/api/changeset"
# make the directories which we will save the data to
mkdir -p "${SAVETO}/head"
mkdir -p "${SAVETO}/body"
# find the last changeset id we have downloaded
LAST=`ls -1 "$SAVETO/head/" | sed 's/\.gz$//' | sort -n | tail -n 1`
# if we haven't actually got anything yet we shall start from the lowest fosm
# changeset id minus 1 (because we increment it later)
if [ -z "$LAST" ] ; then
LAST=$(( 1000000001 - 1 ))
fi
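# e.g. if head/ already contains 1000000001.gz .. 1000000123.gz then LAST is
# 1000000123 and the function below will try changeset/1000000124 first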
# define a function to try to download the next changeset
tryNext() {
    NEXT=$(( $LAST + 1 ))
    echo "Trying to GET changeset/$NEXT..."
    curl --fail -o "${SAVETO}/head/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT"
    if [ $? -ne 0 ] ; then
        # HTTP page not retrieved (curl exits 22 when the server returns an error status)
        echo "changeset/$NEXT not found. Exiting, try again later."
    else
        echo "...GOT changeset/$NEXT."
        # compress
        gzip "${SAVETO}/head/${NEXT}"
        if [ -n "$DOWNLOAD_BODY" ] ; then
            echo "Trying to GET changeset/$NEXT/download..."
            curl --fail -o "${SAVETO}/body/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT/download"
            if [ $? -ne 0 ] ; then
                echo "We got changeset/$NEXT, but failed to get changeset/$NEXT/download."
                echo "Removing the head and exiting so we can try again later."
                # the head has already been gzipped above, so remove both names
                rm -f "${SAVETO}/head/${NEXT}" "${SAVETO}/head/${NEXT}.gz"
                exit 1
            fi
            echo "...GOT changeset/$NEXT/download."
            # compress
            gzip "${SAVETO}/body/${NEXT}"
        fi
        LAST=$NEXT
        echo ""
        tryNext
    fi
}
tryNext
#!/bin/sh
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
# This script brings your local fosm minute-replicate mirror up to date with the
# fosm server. It keeps pulling in changes until you are at the same point as
# the fosm server, then patches your osm2pgsql fosm database with the latest
# changes from your local mirror.
# You can either invoke this script via cron and use something like run-one to
# avoid two instances running concurrently (see the example cron entry at the
# end of this header), or just run something like the following (possibly with
# a keep-one-running wrapper):
# while sleep 120; do replicate-fosm-osm2pgsql.sh; done
# if you want to manually get an initial chunk of files (e.g. to catch up to now) you may want to just use,
# curl --create-dirs -o minute-replicate/100/#1/#2.osc.gz http://fosm.org/planet/minute-replicate/100/[000-456]/[000-999].osc.gz
# after you get this initial chunk, you can load the chunk into PostgreSQL in bulk via,
# osm2pgsql --append --bbox [...] --slim minute-replicate/*/*/*.osc.gz
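# Example cron entry for the run-one approach mentioned above (a sketch only;
# the schedule and paths are hypothetical, and run-one comes from Ubuntu's
# run-one package):
# */5 * * * * cd /data/fosm && run-one /path/to/replicate-fosm-osm2pgsql.sh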
SCRIPT_DIR=`dirname "$0"`
# add your osm2pgsql arguments here (see man page for osm2pgsql for help)
OSM2PGSQL_ARGS="--append --slim"
##########################
## define our functions
##########################
# depending on your tileserver set up (if any) you may wish to expire or dirty
# cached tiles; my method is used here, but it is off by default
# to use my method you need to add the following arguments to OSM2PGSQL_ARGS:
# --expire-tiles 10-19 --expire-output expired-tiles-list
# you also need to add expire-tilecache-disk.pl from
# https://gist.github.com/1170520 to the same directory as this script
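# e.g. with tile expiry enabled the arguments above would be combined into
# something like:
# OSM2PGSQL_ARGS="--append --slim --expire-tiles 10-19 --expire-output expired-tiles-list"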
expire_tiles() {
    # change to =true to turn on expire tiles function
    EXPIRE_TILES=
    if [ $EXPIRE_TILES ] ; then
        echo "expiring tiles"
        # the following script should work for mod_tile/renderd/tirex on disk caches
        # too, but I haven't tested it
        $SCRIPT_DIR/expire-tilecache-disk.pl expired-tiles-list /var/cache/tilecache/YOUR_LAYER/
        EXPIRE_TILES_EXIT_CODE=$?
        if [ $EXPIRE_TILES_EXIT_CODE -ne 0 ] ; then
            echo "failed to expire/dirty tiles ($EXPIRE_TILES_EXIT_CODE)"
            exit 1
        fi
    fi
}
# flush out our diff files which were postponed from osm2pgsql
flush_postponed() {
    if [ -e fosm-diff-postponed ] ; then
        echo "Flushing our backlog of postponed osc.gz files"
        # check the argument list will fit in a single call to osm2pgsql on this
        # system (ARG_MAX is a byte limit, so compare against the size of the
        # backlog file rather than the number of lines in it)
        POSTPONED_BYTES=`wc -c < fosm-diff-postponed`
        ARG_MAX=`getconf ARG_MAX`
        if [ $(($POSTPONED_BYTES + 1024)) -gt $ARG_MAX ] ; then # the 1024 is a safety net for $OSM2PGSQL_ARGS and the environment
            echo "can't flush the backlog: too many postponed diff files to fit in one call to osm2pgsql"
            exit 1
        fi
        xargs osm2pgsql $OSM2PGSQL_ARGS < fosm-diff-postponed
        POSTPONED_OSM2PGSQL_EXIT_CODE=$?
        if [ $POSTPONED_OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
            echo "osm2pgsql failed while flushing the backlog, leaving fosm-diff-postponed"
            exit 1
        else
            rm -f fosm-diff-postponed
            expire_tiles
        fi
    fi
}
try_next() {
    # find the URL of the next osc file
    NEXT_URL=`$SCRIPT_DIR/planet-replicate-find-next.pl`
    FIND_NEXT_EXIT_CODE=$?
    if [ $FIND_NEXT_EXIT_CODE -ne 0 ] ; then
        echo "planet-replicate-find-next.pl failed ($FIND_NEXT_EXIT_CODE) so we are stopping now also"
        exit $FIND_NEXT_EXIT_CODE
    fi
    curl --fail --create-dirs -o "$NEXT_URL" "http://fosm.org/planet/$NEXT_URL"
    CURL_EXIT_CODE=$?
    if [ $CURL_EXIT_CODE -eq 22 ] ; then
        # curl didn't retrieve the file, most likely there are no more osc files yet
        echo "curl $NEXT_URL reached end of osc files ($CURL_EXIT_CODE)"
    elif [ $CURL_EXIT_CODE -ne 0 ] ; then
        # curl failed to get the file, something went wrong
        echo "curl $NEXT_URL failed ($CURL_EXIT_CODE)"
    else
        echo "GOT $NEXT_URL"
        if [ $POSTPONE ] ; then
            echo "$NEXT_URL" >> fosm-diff-postponed
            try_next
        else
            osm2pgsql $OSM2PGSQL_ARGS "$NEXT_URL"
            OSM2PGSQL_EXIT_CODE=$?
            if [ $OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
                echo "osm2pgsql failed for $NEXT_URL ($OSM2PGSQL_EXIT_CODE)"
                if [ $OSM2PGSQL_EXIT_CODE -eq 137 ] ; then
                    echo "  osm2pgsql received the KILL signal, probably not enough memory"
                fi
                exit $OSM2PGSQL_EXIT_CODE
            else
                try_next
            fi
        fi
    fi
}
##########################
## main function
##########################
# run the flush before we start in case we didn't cleanly finish last time
flush_postponed
if [ "$1" = "--postpone" ] ; then
# withhold loading downloaded files into osm2pgsql right now
# instead, add the file we'll download to a backlog list
POSTPONE=true
else
# load each osc file into postgres via osm2pgsql individually as soon as we
# have downloaded the osc file
POSTPONE=
fi
# try to download the next osc file and either load it into postgres or add it
# to a postponed list
try_next
# clear out the postponed list by loading all osc files into postgres
flush_postponed