@andrewharvey
Created July 8, 2011 22:44
Keeps a local minute-replicate mirror of fosm.org osc files
#!/usr/bin/perl -w
# Inspect a local minute-replicate mirror and return the URL of the next diff file
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
use strict;
use warnings;
use Log::Log4perl qw(:easy);
Log::Log4perl->easy_init($DEBUG); #DEBUG, INFO, WARN, ERROR, FATAL
# minute-replicate/A/B/C;
my $dir = "minute-replicate";
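# Example (hypothetical layout): if the newest diff on disk is
# minute-replicate/100/123/456.osc.gz, this script prints
# minute-replicate/100/123/457.osc.gz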
# open the minute-replicate directory
opendir(my $dh, $dir) || die("You need to have the $dir directory in your current working directory.\n");
# find all the sub-directories and get the largest number one
my @files = grep { /^\d+$/ } readdir($dh); closedir $dh; # ignore . .. and any non-numeric entries
my @files_sorted = sort @files;
my $largest_A = pop @files_sorted;
INFO "Largest A found is: $largest_A\n";
opendir(my $Adh, "$dir/$largest_A") || die("Cannot open $dir/$largest_A: $!\n");
# find all the sub-directories and get the largest number one
my @files_in_A = grep { /^\d+$/ } readdir($Adh); closedir $Adh; # ignore . .. and any non-numeric entries
my @files_in_A_sorted = sort @files_in_A;
my $largest_B = pop @files_in_A_sorted;
INFO "Largest B found is: $largest_B\n";
# next inside this directory, find all sub-files and find the largest number one
opendir(my $Bdh, "$dir/$largest_A/$largest_B") || die("Cannot open $dir/$largest_A/$largest_B: $!\n");
my @files_in_B = grep(/\.osc\.gz$/, readdir($Bdh));
closedir $Bdh;
my @files_in_B_sorted = sort @files_in_B;
my $largest_C = pop @files_in_B_sorted;
$largest_C =~ /^(\d+)\.osc\.gz$/ or die("Unexpected file name: $largest_C\n");
$largest_C = $1;
INFO "Largest C found is: $largest_C\n";
# next determine the next expected file
my $next_A = $largest_A;
my $next_B;
my $next_C;
if ($largest_C == 999) {
    $next_C = 0;
    $next_B = $largest_B + 1;
} else {
    $next_C = $largest_C + 1;
    $next_B = $largest_B;
}
if ($next_B == 1000) {
    $next_A++;
    $next_B = 0;
}
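# Roll-over example (hypothetical): if the newest file were
# minute-replicate/100/123/999.osc.gz the next expected file is
# minute-replicate/100/124/000.osc.gz; likewise 100/999/999 rolls over to
# minute-replicate/101/000/000.osc.gz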
print "$dir/".sprintf("%03d", $next_A)."/".sprintf("%03d", $next_B)."/".sprintf("%03d", $next_C).".osc.gz\n";
#!/bin/sh
# Author: Andrew Harvey <andrew.harvey4@gmail.com>
# License: CC0 http://creativecommons.org/publicdomain/zero/1.0/
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# This script will replicate changesets pushed to fosm. The recommended way to
# invoke this script is via something like:
# while sleep 2h; do replicate-fosm-changesets.sh; done
# You can start this script from a blank state, but it may be more efficient to
# kick start your local copy by running something like (where you replace
# 1000002000 with a number close to the largest changeset ID in use):
# curl -o "head/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]"
# curl -o "body/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]/download"
# We will always get the changeset/id (head) document which gives the changeset
# tags. We can additionally grab the changeset contents, i.e. the
# changeset/id/download (body) document. This is controlled by the
# DOWNLOAD_BODY variable. If you already have the minutly diffs you probably
# don't NEED the body as you have that information in your osc diff files.
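# For reference (based on the OSM 0.6 API, which fosm mirrors; attribute and
# element details may vary): the head document is an <osm> wrapper around a
# single <changeset> element carrying the changeset tags, roughly
#   <osm version="0.6"><changeset id="1000000001" ...><tag k="comment" v="..."/></changeset></osm>
# and the body document is an osmChange file, roughly
#   <osmChange version="0.6"><create><node id="..." lat="..." lon="..."/>...</create>...</osmChange>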
DOWNLOAD_BODY=true
#DOWNLOAD_BODY=
# Where shall we save the data we download to?
SAVETO="/data/fosm/api/changeset"
# make the directories which we will save the data to
mkdir -p "${SAVETO}/head"
mkdir -p "${SAVETO}/body"
# find the last changeset id we have downloaded
LAST=`ls -1 "$SAVETO/head/" | sed 's/\.gz$//' | sort -n | tail -n 1`
# if we haven't actually got anything yet we shall start from the lowest fosm
# changeset id minus 1 (because we increment it later)
if [ -z "$LAST" ] ; then
LAST=$(( 1000000001 - 1 ))
fi
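# e.g. if head/ already contains 1000000001.gz .. 1000000123.gz then LAST is
# 1000000123 and the function below will try changeset/1000000124 first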
# define a function to try to download the next changeset
tryNext() {
    NEXT=$(( $LAST + 1 ))
    echo "Trying to GET changeset/$NEXT..."
    curl --fail -o "${SAVETO}/head/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT"
    if [ $? -ne 0 ] ; then
        # HTTP page not retrieved (curl exits 22 when the server returns an error status)
        echo "changeset/$NEXT not found. Exiting, try again later."
    else
        echo "...GOT changeset/$NEXT."
        # compress
        gzip "${SAVETO}/head/${NEXT}"
        if [ -n "$DOWNLOAD_BODY" ] ; then
            echo "Trying to GET changeset/$NEXT/download..."
            curl --fail -o "${SAVETO}/body/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT/download"
            if [ $? -ne 0 ] ; then
                echo "We got changeset/$NEXT, but failed to get changeset/$NEXT/download."
                echo "Removing the head and exiting so we can try again later."
                # the head has already been gzipped above, so remove both names
                rm -f "${SAVETO}/head/${NEXT}" "${SAVETO}/head/${NEXT}.gz"
                exit 1
            fi
            echo "...GOT changeset/$NEXT/download."
            # compress
            gzip "${SAVETO}/body/${NEXT}"
        fi
        LAST=$NEXT
        echo ""
        tryNext
    fi
}
tryNext
#!/bin/sh
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
# This script brings your local fosm minute-replicate mirror up to date with the
# fosm server. It keeps pulling in changes until you are at the same point as
# the fosm server, then patches your osm2pgsql fosm database with the latest
# changes from your local mirror.
# You can either invoke this script via cron and use something like run-one to
# avoid two instances running concurrently (see the example cron entry at the
# end of this header), or just run something like the following (possibly with
# a keep-one-running wrapper):
# while sleep 120; do replicate-fosm-osm2pgsql.sh; done
# if you want to manually get an initial chunk of files (e.g. to catch up to now) you may want to just use,
# curl --create-dirs -o minute-replicate/100/#1/#2.osc.gz http://fosm.org/planet/minute-replicate/100/[000-456]/[000-999].osc.gz
# after you get this initial chunk, you can load the chunk into PostgreSQL in bulk via,
# osm2pgsql --append --bbox [...] --slim minute-replicate/*/*/*.osc.gz
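# Example cron entry for the run-one approach mentioned above (a sketch only;
# the schedule and paths are hypothetical, and run-one comes from Ubuntu's
# run-one package):
# */5 * * * * cd /data/fosm && run-one /path/to/replicate-fosm-osm2pgsql.sh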
SCRIPT_DIR=`dirname "$0"`
# add your osm2pgsql arguments here (see man page for osm2pgsql for help)
OSM2PGSQL_ARGS="--append --slim"
##########################
## define our functions
##########################
# depending on your tileserver set up (if any) you may wish to expire or dirty
# cached tiles; my method is used here, but it is off by default
# to use my method you need to add the following arguments to OSM2PGSQL_ARGS:
# --expire-tiles 10-19 --expire-output expired-tiles-list
# you also need to add expire-tilecache-disk.pl from
# https://gist.github.com/1170520 to the same directory as this script
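# e.g. with tile expiry enabled the arguments above would be combined into
# something like:
# OSM2PGSQL_ARGS="--append --slim --expire-tiles 10-19 --expire-output expired-tiles-list"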
expire_tiles() {
    # change to =true to turn on expire tiles function
    EXPIRE_TILES=
    if [ $EXPIRE_TILES ] ; then
        echo "expiring tiles"
        # the following script should work for mod_tile/renderd/tirex on disk caches
        # too, but I haven't tested it
        $SCRIPT_DIR/expire-tilecache-disk.pl expired-tiles-list /var/cache/tilecache/YOUR_LAYER/
        EXPIRE_TILES_EXIT_CODE=$?
        if [ $EXPIRE_TILES_EXIT_CODE -ne 0 ] ; then
            echo "failed to expire/dirty tiles ($EXPIRE_TILES_EXIT_CODE)"
            exit 1
        fi
    fi
}
# flush out our diff files which were postponed from osm2pgsql
flush_postponed() {
    if [ -e fosm-diff-postponed ] ; then
        echo "Flushing our backlog of postponed osc.gz files"
        # check the argument list will fit in a single call to osm2pgsql on this
        # system (ARG_MAX is a byte limit, so compare against the size of the
        # backlog file rather than the number of lines in it)
        POSTPONED_BYTES=`wc -c < fosm-diff-postponed`
        ARG_MAX=`getconf ARG_MAX`
        if [ $(($POSTPONED_BYTES + 1024)) -gt $ARG_MAX ] ; then # the 1024 is a safety net for $OSM2PGSQL_ARGS and the environment
            echo "can't flush the backlog: too many postponed diff files to fit in one call to osm2pgsql"
            exit 1
        fi
        xargs osm2pgsql $OSM2PGSQL_ARGS < fosm-diff-postponed
        POSTPONED_OSM2PGSQL_EXIT_CODE=$?
        if [ $POSTPONED_OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
            echo "osm2pgsql failed while flushing the backlog, leaving fosm-diff-postponed"
            exit 1
        else
            rm -f fosm-diff-postponed
            expire_tiles
        fi
    fi
}
try_next() {
    # find the URL of the next osc file
    NEXT_URL=`$SCRIPT_DIR/planet-replicate-find-next.pl`
    FIND_NEXT_EXIT_CODE=$?
    if [ $FIND_NEXT_EXIT_CODE -ne 0 ] ; then
        echo "planet-replicate-find-next.pl failed ($FIND_NEXT_EXIT_CODE) so we are stopping now also"
        exit $FIND_NEXT_EXIT_CODE
    fi
    curl --fail --create-dirs -o "$NEXT_URL" "http://fosm.org/planet/$NEXT_URL"
    CURL_EXIT_CODE=$?
    if [ $CURL_EXIT_CODE -eq 22 ] ; then
        # curl didn't retrieve the file, most likely there are no more osc files yet
        echo "curl $NEXT_URL reached end of osc files ($CURL_EXIT_CODE)"
    elif [ $CURL_EXIT_CODE -ne 0 ] ; then
        # curl failed to get the file, something went wrong
        echo "curl $NEXT_URL failed ($CURL_EXIT_CODE)"
    else
        echo "GOT $NEXT_URL"
        if [ $POSTPONE ] ; then
            echo "$NEXT_URL" >> fosm-diff-postponed
            try_next
        else
            osm2pgsql $OSM2PGSQL_ARGS "$NEXT_URL"
            OSM2PGSQL_EXIT_CODE=$?
            if [ $OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
                echo "osm2pgsql failed for $NEXT_URL ($OSM2PGSQL_EXIT_CODE)"
                if [ $OSM2PGSQL_EXIT_CODE -eq 137 ] ; then
                    echo "  osm2pgsql received the KILL signal, probably not enough memory"
                fi
                exit $OSM2PGSQL_EXIT_CODE
            else
                try_next
            fi
        fi
    fi
}
##########################
## main function
##########################
# run the flush before we start in case we didn't cleanly finish last time
flush_postponed
if [ "$1" = "--postpone" ] ; then
# withhold loading downloaded files into osm2pgsql right now
# instead, add the file we'll download to a backlog list
POSTPONE=true
else
# load each osc file into postgres via osm2pgsql individually as soon as we
# have downloaded the osc file
POSTPONE=
fi
# try to download the next osc file and either load it into postgres or add it
# to a postponed list
try_next
# clear out the postponed list by loading all osc files into postgres
flush_postponed