bradyrx/dm_test.py

## dm_test.py
from scipy.stats import t
import numpy as np
import xarray as xr


def autocovariance(Xi, N, k, Xs):
    autoCov = 0
    T = float(N)
    for i in np.arange(0, N-k):
            autoCov += ((Xi[i+k])-Xs)*(Xi[i]-Xs)
    return (1/(T))*autoCov


def dm_test(target, fct1, fct2, metric='rmse'):
    """Compute the Diebold-Mariano test for MSE between two forecasts.

    Args:
        target (xarray object):
            Actual values that are trying to be forecasted.
        fct1 (xarray object):
            Forecasted values for forecast method 1.
        fct2 (xarray object):
            Forecasted values for forecast method 2.
        metric (str):
            Bias metric applied to forecasts.
            * rmse
            * mae
            * mse

    Returns:
        DM_stat (xarray object):
            DM test statistic. If negative, fct1 is preferred. If positive, fct2 is
            preferred.
        pval (xarray object):
            P values associated with the DM test.

    References:
        * Boiroju, Naveen Kumar, et al. "A bootstrap test for equality of mean absolute
          errors." ARPN Journal of engineering and applied sciences 6.5 (2011): 9-11.
        * Gneiting, Tilmann, and Matthias Katzfuss. "Probabilistic forecasting." Annual
          Review of Statistics and Its Application 1 (2014): 125-151.
        * Harvey, David, Stephen Leybourne, and Paul Newbold. "Testing the equality of
          prediction mean squared errors." International Journal of forecasting 13.2
          (1997): 281-291.
        * Diebold, F. X. and Mariano, R. S. (1995), Comparing predictive accuracy,
          Journal of business & economic statistics 13(3), 253-264.
        * https://github.com/johntwk/Diebold-Mariano-Test/blob/master/dm_test.py
    """
    # drop coordinates since they'll be out of sync
    fct1 = fct1.drop('time')
    fct2 = fct2.drop('time')
    target = target.drop('time')

    # length of forecast (does it make sense that T here
    # is many forecasts? Or are they looking for each entry
    # to be a lead time?)
    T = float(len(target))

    # MSE for now
    if metric == 'mse':
        e1 = (target - fct1)**2
        e2 = (target - fct2)**2
    elif metric == 'rmse':
        e1 = np.sqrt((target - fct1)**2)
        e2 = np.sqrt((target - fct2)**2)
    elif metric == 'mae':
        e1 = np.absolute(target - fct1)
        e2 = np.absolute(target - fct2)
    # difference in two different errors
    d = e1 - e2

    # Mean of individual errors
    mean_d = d.mean()

    # Covariance
    gamma = autocovariance(d, len(d), 0, mean_d)
    V_d = gamma/T

    # Calculate DM-Statistic
    DM_stat = V_d**(-0.5)*mean_d
    p_value = 2*t.cdf(-abs(DM_stat), df=T-1)
    return xr.DataArray(DM_stat), xr.DataArray(p_value)
	from scipy.stats import t
	import numpy as np
	import xarray as xr


	def autocovariance(Xi, N, k, Xs):
	autoCov = 0
	T = float(N)
	for i in np.arange(0, N-k):
	autoCov += ((Xi[i+k])-Xs)*(Xi[i]-Xs)
	return (1/(T))*autoCov


	def dm_test(target, fct1, fct2, metric='rmse'):
	"""Compute the Diebold-Mariano test for MSE between two forecasts.

	Args:
	target (xarray object):
	Actual values that are trying to be forecasted.
	fct1 (xarray object):
	Forecasted values for forecast method 1.
	fct2 (xarray object):
	Forecasted values for forecast method 2.
	metric (str):
	Bias metric applied to forecasts.
	* rmse
	* mae
	* mse

	Returns:
	DM_stat (xarray object):
	DM test statistic. If negative, fct1 is preferred. If positive, fct2 is
	preferred.
	pval (xarray object):
	P values associated with the DM test.

	References:
	* Boiroju, Naveen Kumar, et al. "A bootstrap test for equality of mean absolute
	errors." ARPN Journal of engineering and applied sciences 6.5 (2011): 9-11.
	* Gneiting, Tilmann, and Matthias Katzfuss. "Probabilistic forecasting." Annual
	Review of Statistics and Its Application 1 (2014): 125-151.
	* Harvey, David, Stephen Leybourne, and Paul Newbold. "Testing the equality of
	prediction mean squared errors." International Journal of forecasting 13.2
	(1997): 281-291.
	* Diebold, F. X. and Mariano, R. S. (1995), Comparing predictive accuracy,
	Journal of business & economic statistics 13(3), 253-264.
	* https://github.com/johntwk/Diebold-Mariano-Test/blob/master/dm_test.py
	"""
	# drop coordinates since they'll be out of sync
	fct1 = fct1.drop('time')
	fct2 = fct2.drop('time')
	target = target.drop('time')

	# length of forecast (does it make sense that T here
	# is many forecasts? Or are they looking for each entry
	# to be a lead time?)
	T = float(len(target))

	# MSE for now
	if metric == 'mse':
	e1 = (target - fct1)**2
	e2 = (target - fct2)**2
	elif metric == 'rmse':
	e1 = np.sqrt((target - fct1)**2)
	e2 = np.sqrt((target - fct2)**2)
	elif metric == 'mae':
	e1 = np.absolute(target - fct1)
	e2 = np.absolute(target - fct2)
	# difference in two different errors
	d = e1 - e2

	# Mean of individual errors
	mean_d = d.mean()

	# Covariance
	gamma = autocovariance(d, len(d), 0, mean_d)
	V_d = gamma/T

	# Calculate DM-Statistic
	DM_stat = V_d*(-0.5)mean_d
	p_value = 2*t.cdf(-abs(DM_stat), df=T-1)
	return xr.DataArray(DM_stat), xr.DataArray(p_value)