# khaeru/dag_reporting.py

## dag_reporting.py
"""Reporting using a directed acyclic graph (DAG).
2019-01-24 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>

This is a demo of one possible pattern for a reporting architecture for
ixmp/MESSAGE. It uses a directed acyclic graph (DAG) in which the nodes are
operations and the edges are data. Calling get(…) on the graph causes a node's
output, and the outputs of all its dependencies, to be computed and retrieved.

This is implemented using dask:

- http://docs.dask.org/en/latest/graphs.html
- http://docs.dask.org/en/latest/optimize.html#example

…but any other package which supports computation on graphs could be used.


The implementation is in three parts:

1. Very minimal data structures to mock ixmp.Scenario.
2. The internal pieces:

   - Basic node (calculation) operations.
   - Tools for manipulating graphs of nodes.

3. The user interface. <-- Look at this first!

"""
from functools import partial

import dask
import numpy as np
import xarray as xr


# 1. Very minimal demonstration data ------------------------------------------

# Labels along each model dimension
coords = {
    'level': ['primary', 'final'],
    'species': ['CO2', 'CH4'],
    'region': ['AT', 'CA', 'US'],
    'year': [2020, 2030, 2040, 2050],
}

# Dimensions of each model quantity, keyed by quantity name.
# NOTE: this name shadows the builtin vars(); it is kept because other parts
# of this module refer to it.
vars = {
    'gdp': ('region', 'year'),
    'population': ('region', 'year'),
    'energy': ('region', 'year', 'level'),
    'emissions': ('region', 'year', 'species'),
}


# A toy 'Scenario' with a 'data' attribute
class Scenario:
    """Stand-in for a solved scenario, filled with random numbers.

    The constructor sets two attributes:

    - ``name``: the fixed label 'Demo scenario'.
    - ``data``: an xarray.Dataset created on the module-level ``coords`` and
      holding one random array for every entry of the module-level ``vars``.
    """

    def __init__(self):
        self.name = 'Demo scenario'
        self.data = xr.Dataset({}, coords)

        # One random array per quantity, sized by the labels of its dimensions
        for quantity, dim_names in vars.items():
            shape = tuple(len(coords[dim]) for dim in dim_names)
            self.data[quantity] = (dim_names, np.random.rand(*shape))

    def __repr__(self):
        return '<Scenario>'


# 2. Internal pieces ----------------------------------------------------------

# Node operations
def data(scenario, name):
    """Look up the variable *name* in the ``data`` attribute of *scenario*.

    Returns ``scenario.data[name]`` unchanged.

    Node operations of the same shape could equally load:

    - non-model data required for derived quantities, or
    - configuration values.
    """
    dataset = scenario.data
    return dataset[name]


def aggregate(data, dimension):
    """Sum *data* over *dimension* and announce the computation on stdout.

    *dimension* is handed unchanged to ``data.sum(dim=...)``; the demo in this
    module passes a list of two dimension names. The printed message makes it
    visible when this node is actually evaluated.
    """
    print('Computing an aggregate!')
    total = data.sum(dim=dimension)
    return total


def cumulative(data):
    """Return the total of *data* summed over its 'year' dimension.

    This is a single total via ``data.sum(dim='year')``, not a running sum.
    """
    summed_over_years = data.sum(dim='year')
    return summed_over_years


def ratio(num, denom):
    """Return *num* divided by *denom* and announce the computation on stdout.

    The printed message makes it visible when this node is actually evaluated.
    """
    print('Computing a ratio!')
    quotient = num / denom
    return quotient


def simple_report(scenario, gdp_cap):
    """Print a small report about *scenario* and its GDP per capita.

    Four sections are written to stdout, separated by blank lines: the
    scenario's ``name``, a heading, *gdp_cap* converted with
    ``to_dataframe(name='gdp_cap')`` and unstacked on 'year', and a closing
    marker. Nothing is returned.

    A reporting node like this one could instead:

    - write to a file, a database, etc.
    - return a value of a specific data type, e.g. pyam.IamDataFrame.
    """
    header = 'Scenario name: {0.name}'.format(scenario)
    table = gdp_cap.to_dataframe(name='gdp_cap').unstack('year')
    print(header, 'GDP per capita:', table, '…done!', sep='\n\n')


# Tools for manipulating graphs
def add_vars(graph):
    """Register one node per raw model variable and return *graph*.

    For every key of the module-level ``vars`` mapping, *graph* gains a task
    that calls ``data(<scenario>, name=<key>)`` on the 'scenario' node.
    *graph* is modified in place; the same object is returned.
    """
    for quantity in vars:
        # Bind the variable name now; the scenario is supplied by the graph
        fetch = partial(data, name=quantity)
        graph[quantity] = (fetch, 'scenario')
    return graph


def basic_graph(scenario):
    """Return a minimal graph built around *scenario*.

    The returned dict contains:

    - 'scenario': the *scenario* object passed in,
    - one node per raw model variable, added by add_vars(),
    - 'gdp_cap': the ratio of the 'gdp' and 'population' nodes,
    - 'report': simple_report() applied to 'scenario' and 'gdp_cap'.
    """
    # Use the caller's scenario as the root node. Previously a fresh
    # Scenario() was created here and the argument was silently ignored.
    result = add_vars({'scenario': scenario})

    # Add some operations automatically. They aren't computed unless a call to
    # get() necessitates.
    result['gdp_cap'] = (ratio, 'gdp', 'population')
    result['report'] = (simple_report, 'scenario', 'gdp_cap')

    return result


def print_graph(graph):
    """Print every 'key: value' entry of *graph* on its own line.

    A trailing empty line is printed after the last entry.
    """
    lines = []
    for key, task in graph.items():
        lines.append('{}: {}'.format(key, task))
    print(*lines, '', sep='\n')


# 3. User interface -----------------------------------------------------------

# Stand-in for an ixmp.Scenario that has already been solved
s = Scenario()

# Build the default graph, handing it that Scenario
graph = basic_graph(s)

# Display the plain data structure that defines the graph
print_graph(graph)

# Retrieve a quantity the default graph provides. aggregate() is not called
result = dask.get(graph, 'gdp_cap')
print(result)

# Run the report the default graph provides
dask.get(graph, 'report')

# Extend the graph with one more node…
graph['total emissions'] = (aggregate, 'emissions', ['region', 'year'])
# …and evaluate it. ratio() is not called this time
result = dask.get(graph, 'total emissions')
print(result)


# Define and add a custom operation
def custom_sum(emissions, gdp):
    """Add one emissions value to one GDP value — deliberately meaningless.

    Selects the ('AT', 2020, 'CH4') entry of *emissions* and the
    ('US', 2050) entry of *gdp* through ``.loc``, adds the two, and returns
    the ``.values`` attribute of the sum.
    """
    methane_at_2020 = emissions.loc['AT', 2020, 'CH4']
    gdp_us_2050 = gdp.loc['US', 2050]
    return (methane_at_2020 + gdp_us_2050).values


# Register the custom operation as a node, then evaluate and show it
graph['nonsense'] = (custom_sum, 'emissions', 'gdp')
nonsense = dask.get(graph, 'nonsense')
print(nonsense)
	"""Reporting using a directed acyclic graph (DAG).
	2019-01-24 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>

	This is a demo of one possible pattern for a reporting architecture for
	ixmp/MESSAGE. It uses a directed acyclic graph (DAG) in which the nodes are
	operations and the edges are data. Calling get(…) on the graph causes a node's
	output, and the outputs of all its dependencies, to be computed and retrieved.

	This is implemented using dask:

	- http://docs.dask.org/en/latest/graphs.html
	- http://docs.dask.org/en/latest/optimize.html#example

	…but any other package which supports computation on graphs could be used.


	The implementation is in three parts:

	1. Very minimal data structures to mock ixmp.Scenario.
	2. The internal pieces:

	- Basic node (calculation) operations.
	- Tools for manipulating graphs of nodes.

	3. The user interface. <-- Look at this first!

	"""
	from functools import partial

	import dask
	import numpy as np
	import xarray as xr


	# 1. Very minimal demonstration data ------------------------------------------

	# Labels along each model dimension
	coords = {
	    'level': ['primary', 'final'],
	    'species': ['CO2', 'CH4'],
	    'region': ['AT', 'CA', 'US'],
	    'year': [2020, 2030, 2040, 2050],
	}

	# Dimensions of each model quantity, keyed by quantity name.
	# NOTE: this name shadows the builtin vars(); it is kept because other
	# parts of this module refer to it.
	vars = {
	    'gdp': ('region', 'year'),
	    'population': ('region', 'year'),
	    'energy': ('region', 'year', 'level'),
	    'emissions': ('region', 'year', 'species'),
	}


	# A toy 'Scenario' with a 'data' attribute
	class Scenario:
	    """An object that generates some fake data.

	    The constructor sets ``name`` to 'Demo scenario' and ``data`` to an
	    xarray.Dataset created on ``coords`` that holds one random array for
	    every entry of ``vars``.
	    """

	    def __init__(self):
	        self.name = 'Demo scenario'
	        self.data = xr.Dataset({}, coords)

	        # Populate with random contents, one array per model quantity
	        for var, dims in vars.items():
	            size = [len(coords[d]) for d in dims]
	            self.data[var] = (dims, np.random.rand(*size))

	    def __repr__(self):
	        return '<Scenario>'


	# 2. Internal pieces ----------------------------------------------------------

	# Node operations
	def data(scenario, name):
	    """Retrieve the variable *name* from *scenario*.

	    Returns ``scenario.data[name]``.

	    Similar methods could ingest:

	    - non-model data needed for computing derived quantities, or
	    - configuration values.
	    """
	    return scenario.data[name]


	def aggregate(data, dimension):
	    """Aggregate *data* by summing over *dimension*.

	    *dimension* is passed unchanged to ``data.sum(dim=...)``; the demo in
	    this module passes a list of two dimension names. A message is printed
	    so that it is visible when this node is actually evaluated.
	    """
	    print('Computing an aggregate!')
	    return data.sum(dim=dimension)


	def cumulative(data):
	    """Return the total of *data* summed over the 'year' dimension.

	    This is a single total via ``data.sum(dim='year')``, not a running sum.
	    """
	    return data.sum(dim='year')


	def ratio(num, denom):
	    """Return *num* divided by *denom*.

	    A message is printed so that it is visible when this node is actually
	    evaluated.
	    """
	    print('Computing a ratio!')
	    return num / denom


	def simple_report(scenario, gdp_cap):
	    """Perform some reporting.

	    Prints the scenario's ``name``, a heading, *gdp_cap* converted with
	    ``to_dataframe(name='gdp_cap')`` and unstacked on 'year', and a closing
	    marker, separated by blank lines. Nothing is returned.

	    This method could also:

	    - write to file, a database, etc.
	    - return a value in a specific data type, e.g. pyam.IamDataFrame.

	    """
	    print('Scenario name: {0.name}'.format(scenario),
	          'GDP per capita:',
	          gdp_cap.to_dataframe(name='gdp_cap').unstack('year'),
	          '…done!', sep='\n\n')


	# Tools for manipulating graphs
	def add_vars(graph):
	    """Add nodes to yield each raw model variable from the Scenario.

	    For every key of ``vars``, *graph* gains a task that calls
	    ``data(<scenario>, name=<key>)`` on the 'scenario' node. *graph* is
	    modified in place and also returned.
	    """
	    for v in vars:
	        # Bind the variable name now; the scenario is supplied by the graph
	        graph[v] = (partial(data, name=v), 'scenario')
	    return graph


	def basic_graph(scenario):
	    """Return a minimal graph built around *scenario*.

	    The returned dict contains the *scenario* object under 'scenario', one
	    node per raw model variable (added by add_vars()), a 'gdp_cap' node
	    and a 'report' node.
	    """
	    # Use the caller's scenario as the root node. Previously a fresh
	    # Scenario() was created here and the argument was silently ignored.
	    result = {'scenario': scenario}
	    result = add_vars(result)

	    # Add some operations automatically. They aren't computed unless a call
	    # to get() necessitates.
	    result['gdp_cap'] = (ratio, 'gdp', 'population')
	    result['report'] = (simple_report, 'scenario', 'gdp_cap')

	    return result


	def print_graph(graph):
	    """Print every 'key: value' entry of *graph* on its own line.

	    A trailing empty line is printed after the last entry.
	    """
	    print(*['{}: {}'.format(k, v) for k, v in graph.items()], '', sep='\n')


	# 3. User interface -----------------------------------------------------------

	# Stand-in for an ixmp.Scenario that has already been solved
	s = Scenario()

	# Build the default graph, handing it that Scenario
	graph = basic_graph(s)

	# Display the plain data structure that defines the graph
	print_graph(graph)

	# Retrieve a quantity the default graph provides. aggregate() is not called
	result = dask.get(graph, 'gdp_cap')
	print(result)

	# Run the report the default graph provides
	dask.get(graph, 'report')

	# Extend the graph with one more node…
	graph['total emissions'] = (aggregate, 'emissions', ['region', 'year'])
	# …and evaluate it. ratio() is not called this time
	result = dask.get(graph, 'total emissions')
	print(result)


	# Define and add a custom operation
	def custom_sum(emissions, gdp):
	    """Add apples to oranges…utter madness!

	    Selects the ('AT', 2020, 'CH4') entry of *emissions* and the
	    ('US', 2050) entry of *gdp* through ``.loc``, adds the two, and returns
	    the ``.values`` attribute of the sum.
	    """
	    return (emissions.loc['AT', 2020, 'CH4'] + gdp.loc['US', 2050]).values


	# Register the custom operation as a node, then evaluate and show it
	graph['nonsense'] = (custom_sum, 'emissions', 'gdp')
	nonsense = dask.get(graph, 'nonsense')
	print(nonsense)