Nightly bin a dataset of radial velocities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
############################################################################### | |
# the following is mostly a copy of the scipy implementation of | |
# binned_statistic and binned_statistic_dd | |
# but allowing for a weights parameter | |
from scipy._lib.six import callable, xrange | |
from scipy._lib._numpy_compat import suppress_warnings | |
## careful here! | |
import warnings | |
warnings.simplefilter(action='ignore', category=FutureWarning) | |
def binned_statistic(x, values, statistic='mean', bins=10, range=None, | |
weights=None): | |
try: | |
N = len(bins) | |
except TypeError: | |
N = 1 | |
if N != 1: | |
bins = [np.asarray(bins, float)] | |
if range is not None: | |
if len(range) == 2: | |
range = [range] | |
medians, edges, binnumbers = binned_statistic_dd( | |
[x], values, statistic, bins, range, weights) | |
return medians, edges[0], binnumbers | |
def binned_statistic_dd(sample, values, statistic='mean', bins=10, range=None, | |
weights=None, expand_binnumbers=False): | |
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max'] | |
if not callable(statistic) and statistic not in known_stats: | |
raise ValueError('invalid statistic %r' % (statistic, )) | |
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`) | |
# `Dlen` is the length of elements along each dimension. | |
# This code is based on np.histogramdd | |
try: | |
# `sample` is an ND-array. | |
Dlen, Ndim = sample.shape | |
except (AttributeError, ValueError): | |
# `sample` is a sequence of 1D arrays. | |
sample = np.atleast_2d(sample).T | |
Dlen, Ndim = sample.shape | |
# Store initial shape of `values` to preserve it in the output | |
values = np.asarray(values) | |
input_shape = list(values.shape) | |
# Make sure that `values` is 2D to iterate over rows | |
values = np.atleast_2d(values) | |
if weights is not None: | |
weights = np.atleast_2d(weights) | |
Vdim, Vlen = values.shape | |
# Make sure `values` match `sample` | |
if (statistic != 'count' and Vlen != Dlen): | |
raise AttributeError('The number of `values` elements must match the ' | |
'length of each `sample` dimension.') | |
nbin = np.empty(Ndim, int) # Number of bins in each dimension | |
edges = Ndim * [None] # Bin edges for each dim (will be 2D array) | |
dedges = Ndim * [None] # Spacing between edges (will be 2D array) | |
try: | |
M = len(bins) | |
if M != Ndim: | |
raise AttributeError('The dimension of bins must be equal ' | |
'to the dimension of the sample x.') | |
except TypeError: | |
bins = Ndim * [bins] | |
# Select range for each dimension | |
# Used only if number of bins is given. | |
if range is None: | |
smin = np.atleast_1d(np.array(sample.min(axis=0), float)) | |
smax = np.atleast_1d(np.array(sample.max(axis=0), float)) | |
else: | |
smin = np.zeros(Ndim) | |
smax = np.zeros(Ndim) | |
for i in xrange(Ndim): | |
smin[i], smax[i] = range[i] | |
# Make sure the bins have a finite width. | |
for i in xrange(len(smin)): | |
if smin[i] == smax[i]: | |
smin[i] = smin[i] - .5 | |
smax[i] = smax[i] + .5 | |
# Create edge arrays | |
for i in xrange(Ndim): | |
if np.isscalar(bins[i]): | |
nbin[i] = bins[i] + 2 # +2 for outlier bins | |
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1) | |
else: | |
edges[i] = np.asarray(bins[i], float) | |
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins | |
dedges[i] = np.diff(edges[i]) | |
nbin = np.asarray(nbin) | |
# Compute the bin number each sample falls into, in each dimension | |
sampBin = [np.digitize(sample[:, i], edges[i]) for i in xrange(Ndim)] | |
# Using `digitize`, values that fall on an edge are put in the right bin. | |
# For the rightmost bin, we want values equal to the right | |
# edge to be counted in the last bin, and not as an outlier. | |
for i in xrange(Ndim): | |
# Find the rounding precision | |
decimal = int(-np.log10(dedges[i].min())) + 6 | |
# Find which points are on the rightmost edge. | |
on_edge = np.where( | |
np.around(sample[:, i], decimal) == np.around( | |
edges[i][-1], decimal))[0] | |
# Shift these points one bin to the left. | |
sampBin[i][on_edge] -= 1 | |
# Compute the sample indices in the flattened statistic matrix. | |
binnumbers = np.ravel_multi_index(sampBin, nbin) | |
result = np.empty([Vdim, nbin.prod()], float) | |
if statistic == 'mean': | |
result.fill(np.nan) | |
flatcount = np.bincount(binnumbers, None) | |
a = flatcount.nonzero() | |
for vv in xrange(Vdim): | |
flatsum = np.bincount(binnumbers, values[vv]) | |
result[vv, a] = flatsum[a] / flatcount[a] | |
elif statistic == 'std': | |
result.fill(0) | |
flatcount = np.bincount(binnumbers, None) | |
a = flatcount.nonzero() | |
for vv in xrange(Vdim): | |
flatsum = np.bincount(binnumbers, values[vv]) | |
flatsum2 = np.bincount(binnumbers, values[vv]**2) | |
result[vv, a] = np.sqrt(flatsum2[a] / flatcount[a] - | |
(flatsum[a] / flatcount[a])**2) | |
elif statistic == 'count': | |
result.fill(0) | |
flatcount = np.bincount(binnumbers, None) | |
a = np.arange(len(flatcount)) | |
result[:, a] = flatcount[np.newaxis, :] | |
elif statistic == 'sum': | |
result.fill(0) | |
for vv in xrange(Vdim): | |
flatsum = np.bincount(binnumbers, values[vv]) | |
a = np.arange(len(flatsum)) | |
result[vv, a] = flatsum | |
elif statistic == 'median': | |
result.fill(np.nan) | |
for i in np.unique(binnumbers): | |
for vv in xrange(Vdim): | |
result[vv, i] = np.median(values[vv, binnumbers == i]) | |
elif statistic == 'min': | |
result.fill(np.nan) | |
for i in np.unique(binnumbers): | |
for vv in xrange(Vdim): | |
result[vv, i] = np.min(values[vv, binnumbers == i]) | |
elif statistic == 'max': | |
result.fill(np.nan) | |
for i in np.unique(binnumbers): | |
for vv in xrange(Vdim): | |
result[vv, i] = np.max(values[vv, binnumbers == i]) | |
elif callable(statistic): | |
with np.errstate(invalid='ignore'), suppress_warnings() as sup: | |
sup.filter(RuntimeWarning) | |
try: | |
null = statistic([], weights=[]) | |
except: | |
null = np.nan | |
result.fill(null) | |
for i in np.unique(binnumbers): | |
for vv in xrange(Vdim): | |
if weights is None: | |
result[vv, i] = statistic(values[vv, binnumbers == i]) | |
else: | |
result[vv, i] = statistic(values[vv, binnumbers == i], | |
weights[vv, binnumbers == i]) | |
# Shape into a proper matrix | |
result = result.reshape(np.append(Vdim, nbin)) | |
# Remove outliers (indices 0 and -1 for each bin-dimension). | |
core = [slice(None)] + Ndim * [slice(1, -1)] | |
result = result[core] | |
# Unravel binnumbers into an ndarray, each row the bins for each dimension | |
if (expand_binnumbers and Ndim > 1): | |
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin)) | |
if np.any(result.shape[1:] != nbin - 2): | |
raise RuntimeError('Internal Shape Error') | |
# Reshape to have output (`reulst`) match input (`values`) shape | |
result = result.reshape(input_shape[:-1] + list(nbin - 2)) | |
return result, edges, binnumbers | |
# put back the documentation | |
from scipy.stats import binned_statistic as old_binned_statistic,\ | |
binned_statistic_dd as old_binned_statistic_dd | |
doc1 = old_binned_statistic.__doc__ | |
doc2 = old_binned_statistic_dd.__doc__ | |
binned_statistic.__doc__ = doc1 | |
binned_statistic_dd.__doc__ = doc2 | |
############################################################################### | |
def binRV(time, rv, err=None, stat='wmean', tstat='wmean', estat='addquad'): | |
""" Bin a dataset of radial-velocity observations. | |
Parameters | |
---------- | |
time : array | |
The array of times where the radial velocity is measured. This function | |
does nightly binning, based on the integer part of this array. | |
rv : array | |
The radial-velocity values. | |
err : array (optional) | |
The uncertainty on the radial-velocity values. | |
stat : string or callable (optional) | |
The statistic to compute for the radial velocities. By default, this is | |
'mean' if the `err` array is not provided and weighted mean ('wmean') | |
if `err` is provided. In that case, the routine does inverse variance | |
averaging, using 1/`err`**2 as weights. See other available statistics | |
on the `binned_statistic` function. | |
tstat : string or callable (optional) | |
The statistic to compute for the times. The default is the same as for | |
the radial velocities. See other available statistics on the | |
`binned_statistic` function. | |
estat : string or callable (optional) | |
The statistic to compute for the errors. The default is to add them | |
quadratically and divide by the number. See other available statistics | |
on the `binned_statistic` function. | |
Notes | |
----- | |
Arguably, the most justified way to perform binning is to use the defaults, | |
with times, radial velocities and errors as input. This will lead to a | |
weighted average of the times, a weighted average of the radial velocities | |
and quadratically added errors. | |
""" | |
# "nightly" binning, based on the integer part of the time array | |
intt = time.astype(int) | |
_, indices = np.unique(intt, return_index=True) | |
# include the last time in the array of bins, so we don't miss a point | |
# but be careful: if it is alone, perturb the last time by a small ammount | |
# so that the difference time[-1] - bins[-1] is not zero | |
if indices[-1] == time.size - 1: | |
bins = np.r_[time[indices], time[-1] + 1e-10] | |
else: | |
bins = np.r_[time[indices], time[-1]] | |
# weighted mean | |
wmean = lambda a, e: np.average(a, weights=1 / e**2) | |
# bin the RVs | |
if (err is not None) and (stat == 'wmean'): | |
stat = wmean # default is weighted mean | |
brv = binned_statistic(time, rv, statistic=stat, bins=bins, range=None, | |
weights=err) | |
# bin the times | |
if (err is not None) and (tstat == 'wmean'): | |
tstat = wmean # default is weighted mean | |
times = binned_statistic(time, time, statistic=tstat, bins=bins, | |
range=None, weights=err) | |
# if there are errors, bin them too | |
if err is not None: | |
if estat == 'addquad': | |
# this function adds the elements of `a` quadratically | |
add_quadratically = lambda a: np.sqrt(np.add.reduce(a**2)) | |
# this function then divides by the number of elements | |
mean_quadratically = lambda a: add_quadratically(a) / a.size | |
# the two previous functions are only separated for clarity | |
# bin | |
estat = mean_quadratically | |
errors = binned_statistic(time, err, statistic=estat, bins=bins) | |
else: | |
errors = binned_statistic(time, err, statistic=estat, bins=bins) | |
return times[0], brv[0], errors[0] | |
else: | |
return times[0], brv[0] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment