@sixy6e
Last active August 29, 2015 14:23
baresoil_percentiles

A bare soil percentiles mosaicking workflow, intended as a computational baseline. It queries the AGDC per cell, computes the bare soil percentile mosaic (plus the matching NBAR, FC, satellite and date mosaics) as luigi tasks, and submits the work to PBS as a pbsdsh job.

#!/usr/bin/env python
from datetime import date
from os.path import join as pjoin
import gdal
import luigi
import numpy
import numexpr
from datacube.api.model import DatasetType, Fc25Bands, Ls57Arg25Bands
from datacube.api.model import Pq25Bands
from datacube.api.model import Satellite
from datacube.api.query import list_tiles_as_list
from datacube.config import Config
from datacube.api.utils import get_dataset_metadata, get_mask_pqa
from datacube.api.utils import NDV, calculate_ndvi#, date_to_integer
from datacube.api.utils import get_dataset_data, get_mask_wofs
from eotools.tiling import generate_tiles
from eotools.tiling import TiledOutput
from eotools.geobox import GriddedGeoBox
from eotools.pq_utils import extract_pq_flags
from eotools.pq_utils import pq_apply_dict
def bs_workflow(tiles, percentile=90, xtile=None, ytile=None,
                out_fnames=None):
    """
    A baseline workflow for producing the bare soil percentile mosaic,
    along with the corresponding NBAR, FC, satellite and date mosaics.
    """
    # Get some basic image info
    ds_type = DatasetType.FC25
    ds = tiles[0]
    dataset = ds.datasets[ds_type]
    md = get_dataset_metadata(dataset)
    samples, lines = md.shape
    time_slices = len(tiles)
    geobox = GriddedGeoBox.from_gdal_dataset(gdal.Open(dataset.path))

    # Initialise the tiling scheme for processing
    if xtile is None:
        xtile = samples
    if ytile is None:
        ytile = lines
    chunks = generate_tiles(samples, lines, xtile=xtile, ytile=ytile,
                            generator=False)

    # Define no-data
    no_data_value = NDV
    nan = numpy.float32(numpy.nan)  # for the FC dtype no need for float64

    # Define the output files
    if out_fnames is None:
        nbar_outfname = 'nbar_best_pixel'
        fc_outfname = 'fc_best_pixel'
        sat_outfname = 'sat_best_pixel'
        date_outfname = 'date_best_pixel'
    else:
        nbar_outfname = out_fnames[0]
        fc_outfname = out_fnames[1]
        sat_outfname = out_fnames[2]
        date_outfname = out_fnames[3]

    nbar_outnb = len(Ls57Arg25Bands)
    fc_outnb = len(Fc25Bands)
    out_dtype = gdal.GDT_Int16
    nbar_outds = TiledOutput(nbar_outfname, samples=samples, lines=lines,
                             bands=nbar_outnb, dtype=out_dtype,
                             nodata=no_data_value, geobox=geobox)
    fc_outds = TiledOutput(fc_outfname, samples=samples, lines=lines,
                           bands=fc_outnb, dtype=out_dtype,
                           nodata=no_data_value, geobox=geobox)
    sat_outds = TiledOutput(sat_outfname, samples=samples, lines=lines,
                            dtype=out_dtype, nodata=no_data_value,
                            geobox=geobox)
    date_outds = TiledOutput(date_outfname, samples=samples, lines=lines,
                             dtype=gdal.GDT_Float64, nodata=no_data_value,
                             geobox=geobox)

    satellite_code = {Satellite.LS5: 5, Satellite.LS7: 7, Satellite.LS8: 8}
    fc_bands_subset = [Fc25Bands.PHOTOSYNTHETIC_VEGETATION,
                       Fc25Bands.NON_PHOTOSYNTHETIC_VEGETATION,
                       Fc25Bands.UNMIXING_ERROR]
    # Loop over each spatial tile/chunk and build up the time series
    for chunk in chunks:
        ys, ye = chunk[0]
        xs, xe = chunk[1]
        ysize = ye - ys
        xsize = xe - xs
        dims = (time_slices, ysize, xsize)

        # Initialise the intermediate and best_pixel output arrays
        data = {}
        best_pixel_nbar = {}
        best_pixel_fc = {}
        stack_bare_soil = numpy.zeros(dims, dtype='float32')
        stack_sat = numpy.zeros(dims, dtype='int16')
        stack_date = numpy.zeros(dims, dtype='int32')
        best_pixel_satellite = numpy.zeros((ysize, xsize), dtype='int16')
        best_pixel_date = numpy.zeros((ysize, xsize), dtype='float64')
        best_pixel_satellite.fill(no_data_value)
        best_pixel_date.fill(no_data_value)

        stack_nbar = {}
        for band in Ls57Arg25Bands:
            stack_nbar[band] = numpy.zeros(dims, dtype='int16')
            best_pixel_nbar[band] = numpy.zeros((ysize, xsize),
                                                dtype='int16')
            best_pixel_nbar[band].fill(no_data_value)

        stack_fc = {}
        for band in fc_bands_subset:
            stack_fc[band] = numpy.zeros(dims, dtype='int16')
            best_pixel_fc[band] = numpy.zeros((ysize, xsize),
                                              dtype='int16')
            best_pixel_fc[band].fill(no_data_value)

        for idx, ds in enumerate(tiles):
            pqa = ds.datasets[DatasetType.PQ25]
            nbar = ds.datasets[DatasetType.ARG25]
            fc = ds.datasets[DatasetType.FC25]
            try:
                wofs = ds.datasets[DatasetType.WATER]
            except KeyError:
                print "Missing water for:\n {}".format(ds.end_datetime)
                wofs = None

            # mask = numpy.zeros((ysize, xsize), dtype='bool')
            # TODO update to use the api's version of extract_pq
            # pq_data = get_dataset_data(pqa, x=xs, y=ys, x_size=xsize,
            #                            y_size=ysize)[Pq25Bands.PQ]
            # mask = extract_pq_flags(pq_data, combine=True)
            # mask = ~mask
            mask = get_mask_pqa(pqa, x=xs, y=ys, x_size=xsize, y_size=ysize)

            # WOfS
            if wofs is not None:
                mask = get_mask_wofs(wofs, x=xs, y=ys, x_size=xsize,
                                     y_size=ysize, mask=mask)

            # NBAR
            data[DatasetType.ARG25] = get_dataset_data(nbar, x=xs, y=ys,
                                                       x_size=xsize,
                                                       y_size=ysize)

            # NDVI
            red = data[DatasetType.ARG25][Ls57Arg25Bands.RED]
            nir = data[DatasetType.ARG25][Ls57Arg25Bands.NEAR_INFRARED]
            ndvi = calculate_ndvi(red, nir)
            ndvi[mask] = no_data_value
            mask |= numexpr.evaluate("(ndvi < 0.0) | (ndvi > 0.3)")

            # FC
            data[DatasetType.FC25] = get_dataset_data(fc, x=xs, y=ys,
                                                      x_size=xsize,
                                                      y_size=ysize)
            bare_soil = data[DatasetType.FC25][Fc25Bands.BARE_SOIL]
            mask |= numexpr.evaluate("(bare_soil < 0) | (bare_soil > 8000)")

            # Apply the mask to each dataset and insert into the 3D array
            for band in Ls57Arg25Bands:
                data[DatasetType.ARG25][band][mask] = no_data_value
                stack_nbar[band][idx] = data[DatasetType.ARG25][band]
            for band in fc_bands_subset:
                data[DatasetType.FC25][band][mask] = no_data_value
                stack_fc[band][idx] = data[DatasetType.FC25][band]

            # Add bare soil, satellite and date to the 3D arrays
            stack_bare_soil[idx] = bare_soil
            stack_bare_soil[idx][mask] = nan
            stack_sat[idx][:] = satellite_code[fc.satellite]
            dtime = int(ds.end_datetime.strftime('%Y%m%d'))
            stack_date[idx][:] = dtime
        # Calculate the percentile
        pct_fc = numpy.nanpercentile(stack_bare_soil, percentile,
                                     axis=0, interpolation='nearest')

        # Loop over each time slice and generate a mosaic for each dataset_type
        for idx in range(time_slices):
            pct_idx = pct_fc == stack_bare_soil[idx]
            for band in Ls57Arg25Bands:
                band_data = stack_nbar[band]
                best_pixel_nbar[band][pct_idx] = band_data[idx][pct_idx]
            for band in fc_bands_subset:
                band_data = stack_fc[band]
                best_pixel_fc[band][pct_idx] = band_data[idx][pct_idx]
            best_pixel_satellite[pct_idx] = stack_sat[idx][pct_idx]
            best_pixel_date[pct_idx] = stack_date[idx][pct_idx]

        # Output the current spatial chunk for each dataset
        for band in Ls57Arg25Bands:
            bn = band.value
            band_data = best_pixel_nbar[band]
            nbar_outds.write_tile(band_data, chunk, raster_band=bn)
        for band in fc_bands_subset:
            bn = band.value
            band_data = best_pixel_fc[band]
            fc_outds.write_tile(band_data, chunk, raster_band=bn)
        fc_outds.write_tile(pct_fc, chunk,
                            raster_band=Fc25Bands.BARE_SOIL.value)
        sat_outds.write_tile(best_pixel_satellite, chunk)
        date_outds.write_tile(best_pixel_date, chunk)

    # Close the output files
    nbar_outds.close()
    fc_outds.close()
    sat_outds.close()
    date_outds.close()

if __name__ == '__main__':
    config = Config()
    # satellites = [Satellite(i) for i in ['LS5', 'LS7', 'LS8']]
    satellites = [Satellite(i) for i in ['LS5', 'LS7']]
    min_date = date(1987, 1, 1)
    max_date = date(2015, 12, 31)
    ds_type = DatasetType.ARG25
    x_cell = [146]
    y_cell = [-34]
    tiles = list_tiles_as_list(x=x_cell, y=y_cell, acq_min=min_date,
                               acq_max=max_date,
                               satellites=satellites,
                               datasets=ds_type,
                               database=config.get_db_database(),
                               user=config.get_db_username(),
                               password=config.get_db_password(),
                               host=config.get_db_host(),
                               port=config.get_db_port())

    # bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles')
    # bs_workflow(tiles, outdir='/g/data/v10/testing_ground/jps547/percentiles/pct_95', percentile=95)
    # bs_workflow takes explicit output filenames rather than a directory
    out_dir = '/g/data/v10/testing_ground/jps547/percentiles/test'
    out_fnames = [pjoin(out_dir, name) for name in
                  ('nbar_best_pixel', 'fc_best_pixel',
                   'sat_best_pixel', 'date_best_pixel')]
    bs_workflow(tiles, percentile=95, out_fnames=out_fnames)
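
The heart of bs_workflow is the best-pixel selection: the bare soil fraction is stacked over time, numpy.nanpercentile with interpolation='nearest' picks an actually-observed value per pixel, and the time slice whose bare soil value matches it donates its NBAR, FC, satellite and date values to the mosaics. A minimal standalone sketch of that selection rule (toy values only, not part of the gist):

import numpy

# Toy (time, y, x) bare soil stack; NaN marks masked observations.
stack = numpy.array([[[10., numpy.nan]],
                     [[40., 70.]],
                     [[90., 20.]]])          # shape (3, 1, 2)

# 'nearest' interpolation guarantees the percentile is one of the observed
# values, so an exact equality test recovers the contributing time slice.
pct = numpy.nanpercentile(stack, 90, axis=0, interpolation='nearest')
# pct -> [[ 90.  70.]]

dates = [20080101, 20080202, 20080303]       # one acquisition date per slice
best_date = numpy.zeros(stack.shape[1:], dtype='int32')
for idx, d in enumerate(dates):
    match = pct == stack[idx]                # NaN never compares equal
    best_date[match] = d
# best_date -> [[20080303 20080202]]
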
[pickled cell-query list: one entry, '/g/data/v10/testing_ground/jps547/percentiles/test2/146_-34/datasets_list.pkl']
[work]
output_directory = /g/data/v10/testing_ground/jps547/percentiles/test2
logs_directory = %(output_directory)s/logs
[db]
satellites = LS5,LS7
min_date = 2008_01_01
max_date = 2008_03_31
cell_list = 1
xcells = 121,142,118,146,135
ycells = -29,-22,-23,-34,-18
cell_xmin = 146
cell_ymin = -34
cell_xmax = 147
cell_ymax = -33
[internals]
percentiles = 90,95
cells_per_node = 10
x_tile_size = -1
y_tile_size = 100
[outputs]
pbs_filename = bare_soil_pbsdsh.bash
query_filename = datasets_list.pkl
queries_list = cell_queries.pkl
cell_groups = cell_groups.pkl
groups = NBAR,FC,Satellite,date
basefname_format = {group}_best_pixel_{pct}_percentile.dat
[pbs]
project = v10
queue = express
walltime = 01:00:00
email = joshua.sixsmith@ga.gov.au
modules = python/2.7.6,numpy/1.9.2,gdal,rasterio,eo-tools/0.4rc,luigi-mpi,agdc-api/0.1.0-b20150622-DEWNR
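
A note on the [internals] tile sizes: x_tile_size = -1 is treated as "use the full image width" (PercentileTask converts non-positive sizes to None before calling bs_workflow), while y_tile_size = 100 processes each cell in strips of 100 rows. The helper below is an illustrative stand-in for eotools' generate_tiles, not its actual implementation, assuming the nominal 4000 x 4000 pixel AGDC cell:

def row_chunks(samples, lines, ytile):
    """Illustrative stand-in for generate_tiles when xtile == samples."""
    for ys in range(0, lines, ytile):
        ye = min(ys + ytile, lines)
        yield ((ys, ye), (0, samples))

# For a nominal 4000 x 4000 pixel cell with y_tile_size = 100 this yields
# 40 strips: ((0, 100), (0, 4000)), ((100, 200), (0, 4000)), ...
chunks = list(row_chunks(4000, 4000, 100))
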
#!/usr/bin/env python
import luigi
import os
from os.path import join as pjoin, exists, dirname
import cPickle as pickle
import glob
import argparse
import logging
from datacube.api.model import DatasetType
from baresoil_percentile_baseline import bs_workflow
CONFIG = luigi.configuration.get_config()
CONFIG.add_config_path(pjoin(dirname(__file__), 'config.cfg'))

class PercentileTask(luigi.Task):
    """
    Computes the percentile task for a given cell.
    """
    query_fname = luigi.Parameter()
    pct = luigi.IntParameter()
    output_dir = luigi.Parameter()

    def requires(self):
        return []

    def output(self):
        basename = pjoin(self.output_dir, 'percentile_{}.completed')
        out_fname = basename.format(self.pct)
        return luigi.LocalTarget(out_fname)

    def run(self):
        data_groups = CONFIG.get('outputs', 'groups').split(',')
        base_fname_format = pjoin(self.output_dir,
                                  CONFIG.get('outputs', 'basefname_format'))
        out_fnames = []
        for group in data_groups:
            out_fnames.append(base_fname_format.format(group=group,
                                                       pct=self.pct))

        # Load the db query
        with open(self.query_fname, 'r') as f:
            db_tiles = pickle.load(f)

        # Get the processing tile sizes
        xtile = int(CONFIG.get('internals', 'x_tile_size'))
        ytile = int(CONFIG.get('internals', 'y_tile_size'))
        xtile = None if xtile <= 0 else xtile
        ytile = None if ytile <= 0 else ytile

        # Execute
        bs_workflow(db_tiles, percentile=self.pct, xtile=xtile,
                    ytile=ytile, out_fnames=out_fnames)

        with self.output().open('w') as outf:
            outf.write('Complete')

class DefinePercentileTasks(luigi.Task):
    """
    Issues PercentileTask's to each cell associated
    with the tile defined by the start and end index.
    """
    idx1 = luigi.IntParameter()
    idx2 = luigi.IntParameter()

    def requires(self):
        base_out_dir = CONFIG.get('work', 'output_directory')
        queries_fname = pjoin(base_out_dir,
                              CONFIG.get('outputs', 'queries_list'))
        with open(queries_fname, 'r') as infile:
            cell_queries = pickle.load(infile)

        percentiles = CONFIG.get('internals', 'percentiles').split(',')
        percentiles = [int(pct) for pct in percentiles]

        tasks = []
        for cell_query in cell_queries[self.idx1:self.idx2]:
            print cell_query
            out_dir = dirname(cell_query)
            for pct in percentiles:
                tasks.append(PercentileTask(cell_query, pct, out_dir))

        return tasks

    def output(self):
        out_dir = CONFIG.get('work', 'output_directory')
        out_fname = pjoin(out_dir,
                          'DefinePercentileTasks{}:{}.completed')
        out_fname = out_fname.format(self.idx1, self.idx2)
        return luigi.LocalTarget(out_fname)

    def run(self):
        with self.output().open('w') as outf:
            outf.write('Completed')

if __name__ == '__main__':
    # Setup command-line arguments
    desc = "Processes zonal stats for a given set of cells."
    hlp = ("The tile/chunk index to retrieve from the tiles list. "
           "(Needs to have been previously pickled to the file named by "
           "the [outputs] cell_groups option.)")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--tile', type=int, help=hlp)
    parsed_args = parser.parse_args()
    tile_idx = parsed_args.tile

    # Setup logging
    log_dir = CONFIG.get('work', 'logs_directory')
    if not exists(log_dir):
        os.makedirs(log_dir)
    logfile = "{log_path}/stats_workflow_{uname}_{pid}.log"
    logfile = logfile.format(log_path=log_dir, uname=os.uname()[1],
                             pid=os.getpid())
    logging_level = logging.INFO
    logging.basicConfig(filename=logfile, level=logging_level,
                        format=("%(asctime)s: [%(name)s] (%(levelname)s) "
                                "%(message)s"))

    # Get the list of tiles (groups of cells that each node will operate on)
    tiles_list_fname = pjoin(CONFIG.get('work', 'output_directory'),
                             CONFIG.get('outputs', 'cell_groups'))
    with open(tiles_list_fname, 'r') as in_file:
        tiles = pickle.load(in_file)

    # Initialise the job
    tile = tiles[tile_idx]
    tasks = [DefinePercentileTasks(tile[0], tile[1])]
    luigi.build(tasks, local_scheduler=True, workers=16)
    luigi.run()
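
For reference, PercentileTask.run() hands bs_workflow four output filenames per cell and percentile. A quick illustration of the names it builds from the [outputs] settings in config.cfg (the cell directory in this example is the one from the pickled query list above):

from os.path import join as pjoin

groups = 'NBAR,FC,Satellite,date'.split(',')         # [outputs] groups
fmt = '{group}_best_pixel_{pct}_percentile.dat'      # [outputs] basefname_format
cell_dir = '/g/data/v10/testing_ground/jps547/percentiles/test2/146_-34'
out_fnames = [pjoin(cell_dir, fmt.format(group=g, pct=90)) for g in groups]
# -> .../146_-34/NBAR_best_pixel_90_percentile.dat
#    .../146_-34/FC_best_pixel_90_percentile.dat
#    .../146_-34/Satellite_best_pixel_90_percentile.dat
#    .../146_-34/date_best_pixel_90_percentile.dat
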
#!/usr/bin/env python
import luigi
import argparse
from datetime import date
import os
from os.path import join as pjoin, exists, dirname, abspath
import cPickle as pickle
import subprocess
import logging
from datacube.api.query import list_tiles_as_list
from datacube.api.model import DatasetType, Satellite, BANDS
from eotools.tiling import generate_tiles
PBS_DSH = (
"""#!/bin/bash
#PBS -P {project}
#PBS -q {queue}
#PBS -l walltime={walltime},ncpus={ncpus},mem={mem}GB,jobfs=350GB
#PBS -l wd
#PBS -me
#PBS -M {email}
NNODES={nnodes}
for i in $(seq 1 $NNODES); do
pbsdsh -n $((16 *$i)) -- bash -l -c "{modules} PBS_NNODES=$NNODES PBS_VNODENUM=$i python {pyfile} \
--tile $[$i - 1]" &
done;
wait
""")

def query_cells(xcells, ycells, satellites, min_date, max_date, dataset_types,
                output_dir):
    """
    Query the DB for every cell in the rectangular block defined by
    xcells x ycells.
    Currently the config file for this workflow requires the user to
    specify a rectangular region. I have another workflow that takes a
    vector file as input.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    cell_queries = []
    for ycell in ycells:
        for xcell in xcells:
            # Create the output directory
            cell_dir = '{}_{}'.format(int(xcell), int(ycell))
            out_cell_dir = pjoin(output_dir, cell_dir)
            if not exists(out_cell_dir):
                os.makedirs(out_cell_dir)

            tiles = list_tiles_as_list(x=[xcell], y=[ycell],
                                       acq_min=min_date,
                                       acq_max=max_date,
                                       dataset_types=dataset_types,
                                       satellites=satellites)

            out_fname = pjoin(out_cell_dir, base_out_fname)
            cell_queries.append(out_fname)
            with open(out_fname, 'w') as outf:
                pickle.dump(tiles, outf)

    return cell_queries

def query_cells2(xcells, ycells, satellites, min_date, max_date, dataset_types,
                 output_dir):
    """
    Query the DB for each paired cell (xcells[i], ycells[i]).
    Currently the config file for this workflow requires the user to
    specify a rectangular region or an explicit cell list. I have another
    workflow that takes a vector file as input.
    """
    base_out_fname = CONFIG.get('outputs', 'query_filename')
    cell_queries = []
    for i in range(len(ycells)):
        ycell = ycells[i]
        xcell = xcells[i]

        # Create the output directory
        cell_dir = '{}_{}'.format(int(xcell), int(ycell))
        out_cell_dir = pjoin(output_dir, cell_dir)
        if not exists(out_cell_dir):
            os.makedirs(out_cell_dir)

        tiles = list_tiles_as_list(x=[xcell], y=[ycell],
                                   acq_min=min_date,
                                   acq_max=max_date,
                                   dataset_types=dataset_types,
                                   satellites=satellites)

        out_fname = pjoin(out_cell_dir, base_out_fname)
        cell_queries.append(out_fname)
        with open(out_fname, 'w') as outf:
            pickle.dump(tiles, outf)

    return cell_queries

if __name__ == '__main__':
    desc = "Queries the AGDC for the percentile workflow."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--cfg",
                        help=("The config file used to set up the "
                              "baresoil workflow."))
    args = parser.parse_args()
    cfg = args.cfg

    # Setup the config file
    global CONFIG
    if cfg is None:
        CONFIG = luigi.configuration.get_config()
        CONFIG.add_config_path(pjoin(dirname(__file__), 'config.cfg'))
    else:
        CONFIG = luigi.configuration.get_config()
        CONFIG.add_config_path(cfg)

    # Create the output directory
    out_dir = CONFIG.get('work', 'output_directory')
    if not exists(out_dir):
        os.makedirs(out_dir)

    # Setup logging
    log_dir = CONFIG.get('work', 'logs_directory')
    if not exists(log_dir):
        os.makedirs(log_dir)
    logfile = "{log_path}/query_{uname}_{pid}.log"
    logfile = logfile.format(log_path=log_dir, uname=os.uname()[1],
                             pid=os.getpid())
    logging_level = logging.INFO
    logging.basicConfig(filename=logfile, level=logging_level,
                        format=("%(asctime)s: [%(name)s] (%(levelname)s) "
                                "%(message)s"))

    # Define the DatasetTypes
    # TODO have this defined through the config.cfg
    ds_types = [DatasetType.ARG25, DatasetType.FC25, DatasetType.PQ25,
                DatasetType.WATER]

    # Get the satellites we wish to query
    satellites = CONFIG.get('db', 'satellites')
    satellites = [Satellite(i) for i in satellites.split(',')]

    # Get the min/max date range to query
    min_date = CONFIG.get('db', 'min_date')
    min_date = [int(i) for i in min_date.split('_')]
    min_date = date(min_date[0], min_date[1], min_date[2])
    max_date = CONFIG.get('db', 'max_date')
    max_date = [int(i) for i in max_date.split('_')]
    max_date = date(max_date[0], max_date[1], max_date[2])

    # Do we use an explicit cell list or a rectangular block of cells?
    cell_list = int(CONFIG.get('db', 'cell_list'))
    if cell_list <= 0:
        # Get the min/max cell range to query
        cell_xmin = int(CONFIG.get('db', 'cell_xmin'))
        cell_ymin = int(CONFIG.get('db', 'cell_ymin'))
        cell_xmax = int(CONFIG.get('db', 'cell_xmax'))
        cell_ymax = int(CONFIG.get('db', 'cell_ymax'))
        xcells = range(cell_xmin, cell_xmax, 1)
        ycells = range(cell_ymin, cell_ymax, 1)
    else:
        xcells = CONFIG.get('db', 'xcells')
        xcells = [int(x) for x in xcells.split(',')]
        ycells = CONFIG.get('db', 'ycells')
        ycells = [int(y) for y in ycells.split(',')]

    output_dir = CONFIG.get('work', 'output_directory')
    cell_queries = query_cells2(xcells, ycells, satellites, min_date, max_date,
                                ds_types, output_dir)

    queries_fname = pjoin(output_dir, CONFIG.get('outputs', 'queries_list'))
    with open(queries_fname, 'w') as outf:
        pickle.dump(cell_queries, outf)

    # Create the tiles list that contains groups of cells to operate on.
    # Each node will have a certain number of cells to work on.
    cpnode = int(CONFIG.get('internals', 'cells_per_node'))
    tiles = generate_tiles(len(cell_queries), 1, cpnode, 1, False)
    tiles = [x for y, x in tiles]
    tiles_list_fname = pjoin(out_dir, CONFIG.get('outputs', 'cell_groups'))
    with open(tiles_list_fname, 'w') as out_file:
        pickle.dump(tiles, out_file)

    # Setup the modules to use for the job
    modules = CONFIG.get('pbs', 'modules').split(',')
    modules = ['module load {}; '.format(module) for module in modules]
    modules.insert(0, 'module use /projects/u46/opt/modules/modulefiles;')
    modules = ''.join(modules)

    # Calculate the job node and cpu requirements
    nnodes = len(tiles)
    ncpus = nnodes * 16
    mem = nnodes * 32

    # Populate the PBS shell script
    project = CONFIG.get('pbs', 'project')
    queue = CONFIG.get('pbs', 'queue')
    walltime = CONFIG.get('pbs', 'walltime')
    email = CONFIG.get('pbs', 'email')
    py_file = abspath(pjoin(dirname(__file__), 'percentile_workflow.py'))
    py_path = abspath(dirname(__file__))
    pbs_job = PBS_DSH.format(project=project, queue=queue,
                             walltime=walltime, ncpus=ncpus, mem=mem,
                             email=email, nnodes=nnodes,
                             modules=modules,
                             pyfile=py_file)

    # Output the shell script to disk
    pbs_fname = pjoin(out_dir, CONFIG.get('outputs', 'pbs_filename'))
    with open(pbs_fname, 'w') as out_file:
        out_file.write(pbs_job)

    subprocess.check_call(['qsub', pbs_fname])
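
For the example config above (an explicit list of five cells, cells_per_node = 10), the resulting PBS request is small. The snippet below just restates the script's sizing arithmetic with those numbers, assuming generate_tiles groups the cell indices into runs of cells_per_node (which is what the [x for y, x in tiles] unpacking relies on):

# Hypothetical sizing check for the example config values.
ncells = 5                                  # the xcells/ycells lists have 5 entries
cpnode = 10                                 # [internals] cells_per_node
ngroups = (ncells + cpnode - 1) // cpnode   # -> 1 group of cells
nnodes = ngroups                            # -> 1 node
ncpus = nnodes * 16                         # -> 16 cpus requested
mem = nnodes * 32                           # -> 32 GB requested
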