j-faria/binit.py

## binit.py
import numpy as np

###############################################################################
# the following is mostly a copy of the scipy implementation of
# binned_statistic and binned_statistic_dd
# but allowing for a weights parameter
from scipy._lib.six import callable, xrange
from scipy._lib._numpy_compat import suppress_warnings

## careful here!
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


def binned_statistic(x, values, statistic='mean', bins=10, range=None,
                     weights=None):
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1:
        bins = [np.asarray(bins, float)]

    if range is not None:
        if len(range) == 2:
            range = [range]

    medians, edges, binnumbers = binned_statistic_dd(
        [x], values, statistic, bins, range, weights)

    return medians, edges[0], binnumbers


def binned_statistic_dd(sample, values, statistic='mean', bins=10, range=None,
                        weights=None, expand_binnumbers=False):
    known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
    if not callable(statistic) and statistic not in known_stats:
        raise ValueError('invalid statistic %r' % (statistic, ))

    # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
    # `Dlen` is the length of elements along each dimension.
    # This code is based on np.histogramdd
    try:
        # `sample` is an ND-array.
        Dlen, Ndim = sample.shape
    except (AttributeError, ValueError):
        # `sample` is a sequence of 1D arrays.
        sample = np.atleast_2d(sample).T
        Dlen, Ndim = sample.shape

    # Store initial shape of `values` to preserve it in the output
    values = np.asarray(values)
    input_shape = list(values.shape)
    # Make sure that `values` is 2D to iterate over rows
    values = np.atleast_2d(values)
    if weights is not None:
        weights = np.atleast_2d(weights)
    Vdim, Vlen = values.shape

    # Make sure `values` match `sample`
    if (statistic != 'count' and Vlen != Dlen):
        raise AttributeError('The number of `values` elements must match the '
                             'length of each `sample` dimension.')

    nbin = np.empty(Ndim, int)  # Number of bins in each dimension
    edges = Ndim * [None]  # Bin edges for each dim (will be 2D array)
    dedges = Ndim * [None]  # Spacing between edges (will be 2D array)

    try:
        M = len(bins)
        if M != Ndim:
            raise AttributeError('The dimension of bins must be equal '
                                 'to the dimension of the sample x.')
    except TypeError:
        bins = Ndim * [bins]

    # Select range for each dimension
    # Used only if number of bins is given.
    if range is None:
        smin = np.atleast_1d(np.array(sample.min(axis=0), float))
        smax = np.atleast_1d(np.array(sample.max(axis=0), float))
    else:
        smin = np.zeros(Ndim)
        smax = np.zeros(Ndim)
        for i in xrange(Ndim):
            smin[i], smax[i] = range[i]

    # Make sure the bins have a finite width.
    for i in xrange(len(smin)):
        if smin[i] == smax[i]:
            smin[i] = smin[i] - .5
            smax[i] = smax[i] + .5

    # Create edge arrays
    for i in xrange(Ndim):
        if np.isscalar(bins[i]):
            nbin[i] = bins[i] + 2  # +2 for outlier bins
            edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
        else:
            edges[i] = np.asarray(bins[i], float)
            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
        dedges[i] = np.diff(edges[i])

    nbin = np.asarray(nbin)

    # Compute the bin number each sample falls into, in each dimension
    sampBin = [np.digitize(sample[:, i], edges[i]) for i in xrange(Ndim)]

    # Using `digitize`, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right
    # edge to be counted in the last bin, and not as an outlier.
    for i in xrange(Ndim):
        # Find the rounding precision
        decimal = int(-np.log10(dedges[i].min())) + 6
        # Find which points are on the rightmost edge.
        on_edge = np.where(
            np.around(sample[:, i], decimal) == np.around(
                edges[i][-1], decimal))[0]
        # Shift these points one bin to the left.
        sampBin[i][on_edge] -= 1

    # Compute the sample indices in the flattened statistic matrix.
    binnumbers = np.ravel_multi_index(sampBin, nbin)

    result = np.empty([Vdim, nbin.prod()], float)

    if statistic == 'mean':
        result.fill(np.nan)
        flatcount = np.bincount(binnumbers, None)
        a = flatcount.nonzero()
        for vv in xrange(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            result[vv, a] = flatsum[a] / flatcount[a]
    elif statistic == 'std':
        result.fill(0)
        flatcount = np.bincount(binnumbers, None)
        a = flatcount.nonzero()
        for vv in xrange(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            flatsum2 = np.bincount(binnumbers, values[vv]**2)
            result[vv, a] = np.sqrt(flatsum2[a] / flatcount[a] -
                                    (flatsum[a] / flatcount[a])**2)
    elif statistic == 'count':
        result.fill(0)
        flatcount = np.bincount(binnumbers, None)
        a = np.arange(len(flatcount))
        result[:, a] = flatcount[np.newaxis, :]
    elif statistic == 'sum':
        result.fill(0)
        for vv in xrange(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            a = np.arange(len(flatsum))
            result[vv, a] = flatsum
    elif statistic == 'median':
        result.fill(np.nan)
        for i in np.unique(binnumbers):
            for vv in xrange(Vdim):
                result[vv, i] = np.median(values[vv, binnumbers == i])
    elif statistic == 'min':
        result.fill(np.nan)
        for i in np.unique(binnumbers):
            for vv in xrange(Vdim):
                result[vv, i] = np.min(values[vv, binnumbers == i])
    elif statistic == 'max':
        result.fill(np.nan)
        for i in np.unique(binnumbers):
            for vv in xrange(Vdim):
                result[vv, i] = np.max(values[vv, binnumbers == i])
    elif callable(statistic):
        with np.errstate(invalid='ignore'), suppress_warnings() as sup:
            sup.filter(RuntimeWarning)
            try:
                null = statistic([], weights=[])
            except:
                null = np.nan
        result.fill(null)
        for i in np.unique(binnumbers):
            for vv in xrange(Vdim):
                if weights is None:
                    result[vv, i] = statistic(values[vv, binnumbers == i])
                else:
                    result[vv, i] = statistic(values[vv, binnumbers == i],
                                              weights[vv, binnumbers == i])

    # Shape into a proper matrix
    result = result.reshape(np.append(Vdim, nbin))

    # Remove outliers (indices 0 and -1 for each bin-dimension).
    core = [slice(None)] + Ndim * [slice(1, -1)]
    result = result[core]

    # Unravel binnumbers into an ndarray, each row the bins for each dimension
    if (expand_binnumbers and Ndim > 1):
        binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))

    if np.any(result.shape[1:] != nbin - 2):
        raise RuntimeError('Internal Shape Error')

    # Reshape to have output (`reulst`) match input (`values`) shape
    result = result.reshape(input_shape[:-1] + list(nbin - 2))

    return result, edges, binnumbers


# put back the documentation
from scipy.stats import binned_statistic as old_binned_statistic,\
                        binned_statistic_dd as old_binned_statistic_dd
doc1 = old_binned_statistic.__doc__
doc2 = old_binned_statistic_dd.__doc__
binned_statistic.__doc__ = doc1
binned_statistic_dd.__doc__ = doc2

###############################################################################


def binRV(time, rv, err=None, stat='wmean', tstat='wmean', estat='addquad'):
    """ Bin a dataset of radial-velocity observations.

    Parameters
    ----------
    time : array
        The array of times where the radial velocity is measured. This function
        does nightly binning, based on the integer part of this array.
    rv : array
        The radial-velocity values.
    err : array (optional)
        The uncertainty on the radial-velocity values.
    stat : string or callable (optional)
        The statistic to compute for the radial velocities. By default, this is
        'mean' if the `err` array is not provided and weighted mean ('wmean')
        if `err` is provided. In that case, the routine does inverse variance
        averaging, using 1/`err`**2 as weights. See other available statistics
        on the `binned_statistic` function.
    tstat : string or callable (optional)
        The statistic to compute for the times. The default is the same as for
        the radial velocities. See other available statistics on the
        `binned_statistic` function.
    estat : string or callable (optional)
        The statistic to compute for the errors. The default is to add them
        quadratically and divide by the number. See other available statistics
        on the `binned_statistic` function.

    Notes
    -----
    Arguably, the most justified way to perform binning is to use the defaults,
    with times, radial velocities and errors as input. This will lead to a
    weighted average of the times, a weighted average of the radial velocities
    and quadratically added errors.
    """

    # "nightly" binning, based on the integer part of the time array
    intt = time.astype(int)
    _, indices = np.unique(intt, return_index=True)

    # include the last time in the array of bins, so we don't miss a point
    # but be careful: if it is alone, perturb the last time by a small ammount
    # so that the difference time[-1] - bins[-1] is not zero
    if indices[-1] == time.size - 1:
        bins = np.r_[time[indices], time[-1] + 1e-10]
    else:
        bins = np.r_[time[indices], time[-1]]

    # weighted mean
    wmean = lambda a, e: np.average(a, weights=1 / e**2)

    # bin the RVs
    if (err is not None) and (stat == 'wmean'):
        stat = wmean  # default is weighted mean

    brv = binned_statistic(time, rv, statistic=stat, bins=bins, range=None,
                           weights=err)

    # bin the times
    if (err is not None) and (tstat == 'wmean'):
        tstat = wmean  # default is weighted mean

    times = binned_statistic(time, time, statistic=tstat, bins=bins,
                             range=None, weights=err)

    # if there are errors, bin them too
    if err is not None:
        if estat == 'addquad':
            # this function adds the elements of `a` quadratically
            add_quadratically = lambda a: np.sqrt(np.add.reduce(a**2))
            # this function then divides by the number of elements
            mean_quadratically = lambda a: add_quadratically(a) / a.size
            # the two previous functions are only separated for clarity
            # bin
            estat = mean_quadratically
            errors = binned_statistic(time, err, statistic=estat, bins=bins)
        else:
            errors = binned_statistic(time, err, statistic=estat, bins=bins)

        return times[0], brv[0], errors[0]
    else:
        return times[0], brv[0]
	import numpy as np

	###############################################################################
	# the following is mostly a copy of the scipy implementation of
	# binned_statistic and binned_statistic_dd
	# but allowing for a weights parameter
	from scipy._lib.six import callable, xrange
	from scipy._lib._numpy_compat import suppress_warnings

	## careful here!
	import warnings
	warnings.simplefilter(action='ignore', category=FutureWarning)


	def binned_statistic(x, values, statistic='mean', bins=10, range=None,
	weights=None):
	try:
	N = len(bins)
	except TypeError:
	N = 1

	if N != 1:
	bins = [np.asarray(bins, float)]

	if range is not None:
	if len(range) == 2:
	range = [range]

	medians, edges, binnumbers = binned_statistic_dd(
	[x], values, statistic, bins, range, weights)

	return medians, edges[0], binnumbers


	def binned_statistic_dd(sample, values, statistic='mean', bins=10, range=None,
	weights=None, expand_binnumbers=False):
	known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
	if not callable(statistic) and statistic not in known_stats:
	raise ValueError('invalid statistic %r' % (statistic, ))

	# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
	# `Dlen` is the length of elements along each dimension.
	# This code is based on np.histogramdd
	try:
	# `sample` is an ND-array.
	Dlen, Ndim = sample.shape
	except (AttributeError, ValueError):
	# `sample` is a sequence of 1D arrays.
	sample = np.atleast_2d(sample).T
	Dlen, Ndim = sample.shape

	# Store initial shape of `values` to preserve it in the output
	values = np.asarray(values)
	input_shape = list(values.shape)
	# Make sure that `values` is 2D to iterate over rows
	values = np.atleast_2d(values)
	if weights is not None:
	weights = np.atleast_2d(weights)
	Vdim, Vlen = values.shape

	# Make sure `values` match `sample`
	if (statistic != 'count' and Vlen != Dlen):
	raise AttributeError('The number of `values` elements must match the '
	'length of each `sample` dimension.')

	nbin = np.empty(Ndim, int) # Number of bins in each dimension
	edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
	dedges = Ndim * [None] # Spacing between edges (will be 2D array)

	try:
	M = len(bins)
	if M != Ndim:
	raise AttributeError('The dimension of bins must be equal '
	'to the dimension of the sample x.')
	except TypeError:
	bins = Ndim * [bins]

	# Select range for each dimension
	# Used only if number of bins is given.
	if range is None:
	smin = np.atleast_1d(np.array(sample.min(axis=0), float))
	smax = np.atleast_1d(np.array(sample.max(axis=0), float))
	else:
	smin = np.zeros(Ndim)
	smax = np.zeros(Ndim)
	for i in xrange(Ndim):
	smin[i], smax[i] = range[i]

	# Make sure the bins have a finite width.
	for i in xrange(len(smin)):
	if smin[i] == smax[i]:
	smin[i] = smin[i] - .5
	smax[i] = smax[i] + .5

	# Create edge arrays
	for i in xrange(Ndim):
	if np.isscalar(bins[i]):
	nbin[i] = bins[i] + 2 # +2 for outlier bins
	edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
	else:
	edges[i] = np.asarray(bins[i], float)
	nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
	dedges[i] = np.diff(edges[i])

	nbin = np.asarray(nbin)

	# Compute the bin number each sample falls into, in each dimension
	sampBin = [np.digitize(sample[:, i], edges[i]) for i in xrange(Ndim)]

	# Using `digitize`, values that fall on an edge are put in the right bin.
	# For the rightmost bin, we want values equal to the right
	# edge to be counted in the last bin, and not as an outlier.
	for i in xrange(Ndim):
	# Find the rounding precision
	decimal = int(-np.log10(dedges[i].min())) + 6
	# Find which points are on the rightmost edge.
	on_edge = np.where(
	np.around(sample[:, i], decimal) == np.around(
	edges[i][-1], decimal))[0]
	# Shift these points one bin to the left.
	sampBin[i][on_edge] -= 1

	# Compute the sample indices in the flattened statistic matrix.
	binnumbers = np.ravel_multi_index(sampBin, nbin)

	result = np.empty([Vdim, nbin.prod()], float)

	if statistic == 'mean':
	result.fill(np.nan)
	flatcount = np.bincount(binnumbers, None)
	a = flatcount.nonzero()
	for vv in xrange(Vdim):
	flatsum = np.bincount(binnumbers, values[vv])
	result[vv, a] = flatsum[a] / flatcount[a]
	elif statistic == 'std':
	result.fill(0)
	flatcount = np.bincount(binnumbers, None)
	a = flatcount.nonzero()
	for vv in xrange(Vdim):
	flatsum = np.bincount(binnumbers, values[vv])
	flatsum2 = np.bincount(binnumbers, values[vv]**2)
	result[vv, a] = np.sqrt(flatsum2[a] / flatcount[a] -
	(flatsum[a] / flatcount[a])**2)
	elif statistic == 'count':
	result.fill(0)
	flatcount = np.bincount(binnumbers, None)
	a = np.arange(len(flatcount))
	result[:, a] = flatcount[np.newaxis, :]
	elif statistic == 'sum':
	result.fill(0)
	for vv in xrange(Vdim):
	flatsum = np.bincount(binnumbers, values[vv])
	a = np.arange(len(flatsum))
	result[vv, a] = flatsum
	elif statistic == 'median':
	result.fill(np.nan)
	for i in np.unique(binnumbers):
	for vv in xrange(Vdim):
	result[vv, i] = np.median(values[vv, binnumbers == i])
	elif statistic == 'min':
	result.fill(np.nan)
	for i in np.unique(binnumbers):
	for vv in xrange(Vdim):
	result[vv, i] = np.min(values[vv, binnumbers == i])
	elif statistic == 'max':
	result.fill(np.nan)
	for i in np.unique(binnumbers):
	for vv in xrange(Vdim):
	result[vv, i] = np.max(values[vv, binnumbers == i])
	elif callable(statistic):
	with np.errstate(invalid='ignore'), suppress_warnings() as sup:
	sup.filter(RuntimeWarning)
	try:
	null = statistic([], weights=[])
	except:
	null = np.nan
	result.fill(null)
	for i in np.unique(binnumbers):
	for vv in xrange(Vdim):
	if weights is None:
	result[vv, i] = statistic(values[vv, binnumbers == i])
	else:
	result[vv, i] = statistic(values[vv, binnumbers == i],
	weights[vv, binnumbers == i])

	# Shape into a proper matrix
	result = result.reshape(np.append(Vdim, nbin))

	# Remove outliers (indices 0 and -1 for each bin-dimension).
	core = [slice(None)] + Ndim * [slice(1, -1)]
	result = result[core]

	# Unravel binnumbers into an ndarray, each row the bins for each dimension
	if (expand_binnumbers and Ndim > 1):
	binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))

	if np.any(result.shape[1:] != nbin - 2):
	raise RuntimeError('Internal Shape Error')

	# Reshape to have output (`reulst`) match input (`values`) shape
	result = result.reshape(input_shape[:-1] + list(nbin - 2))

	return result, edges, binnumbers


	# put back the documentation
	from scipy.stats import binned_statistic as old_binned_statistic,\
	binned_statistic_dd as old_binned_statistic_dd
	doc1 = old_binned_statistic.__doc__
	doc2 = old_binned_statistic_dd.__doc__
	binned_statistic.__doc__ = doc1
	binned_statistic_dd.__doc__ = doc2

	###############################################################################


	def binRV(time, rv, err=None, stat='wmean', tstat='wmean', estat='addquad'):
	""" Bin a dataset of radial-velocity observations.

	Parameters
	----------
	time : array
	The array of times where the radial velocity is measured. This function
	does nightly binning, based on the integer part of this array.
	rv : array
	The radial-velocity values.
	err : array (optional)
	The uncertainty on the radial-velocity values.
	stat : string or callable (optional)
	The statistic to compute for the radial velocities. By default, this is
	'mean' if the `err` array is not provided and weighted mean ('wmean')
	if `err` is provided. In that case, the routine does inverse variance
	averaging, using 1/`err`**2 as weights. See other available statistics
	on the `binned_statistic` function.
	tstat : string or callable (optional)
	The statistic to compute for the times. The default is the same as for
	the radial velocities. See other available statistics on the
	`binned_statistic` function.
	estat : string or callable (optional)
	The statistic to compute for the errors. The default is to add them
	quadratically and divide by the number. See other available statistics
	on the `binned_statistic` function.

	Notes
	-----
	Arguably, the most justified way to perform binning is to use the defaults,
	with times, radial velocities and errors as input. This will lead to a
	weighted average of the times, a weighted average of the radial velocities
	and quadratically added errors.
	"""

	# "nightly" binning, based on the integer part of the time array
	intt = time.astype(int)
	_, indices = np.unique(intt, return_index=True)

	# include the last time in the array of bins, so we don't miss a point
	# but be careful: if it is alone, perturb the last time by a small ammount
	# so that the difference time[-1] - bins[-1] is not zero
	if indices[-1] == time.size - 1:
	bins = np.r_[time[indices], time[-1] + 1e-10]
	else:
	bins = np.r_[time[indices], time[-1]]

	# weighted mean
	wmean = lambda a, e: np.average(a, weights=1 / e**2)

	# bin the RVs
	if (err is not None) and (stat == 'wmean'):
	stat = wmean # default is weighted mean

	brv = binned_statistic(time, rv, statistic=stat, bins=bins, range=None,
	weights=err)

	# bin the times
	if (err is not None) and (tstat == 'wmean'):
	tstat = wmean # default is weighted mean

	times = binned_statistic(time, time, statistic=tstat, bins=bins,
	range=None, weights=err)

	# if there are errors, bin them too
	if err is not None:
	if estat == 'addquad':
	# this function adds the elements of `a` quadratically
	add_quadratically = lambda a: np.sqrt(np.add.reduce(a**2))
	# this function then divides by the number of elements
	mean_quadratically = lambda a: add_quadratically(a) / a.size
	# the two previous functions are only separated for clarity
	# bin
	estat = mean_quadratically
	errors = binned_statistic(time, err, statistic=estat, bins=bins)
	else:
	errors = binned_statistic(time, err, statistic=estat, bins=bins)

	return times[0], brv[0], errors[0]
	else:
	return times[0], brv[0]