-
-
Save neocortex/5d962742ef16b072dee9a04d9015f85d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def resample(tseries, rate='15min', short_rate='s', max_gap=None):
    """ Resample (unevenly spaced) timeseries data linearly by first upsampling to a
    high frequency (short_rate) then downsampling to the desired rate.

    The input object is never modified; a sorted copy is used internally.

    :param tseries: a pandas Series/DataFrame with a ``DatetimeIndex``
    :param rate: rate that tseries should be resampled to, as a fixed pandas
        frequency string (default '15min', equivalent to the legacy '15T')
    :param short_rate: intermediate upsampling rate (default 's', equivalent to
        the legacy 'S'); if None, the smallest non-zero interval of tseries is
        used (capped at `rate`)
    :param max_gap: null intervals larger than `max_gap` are being treated as missing
        data and not interpolated. if None, always interpolate. must be provided as pandas
        frequency string format, e.g. '6h'
    :return: the resampled object, indexed on a regular `rate` grid; every grid
        point between the two samples delimiting a too-large gap (both ends
        inclusive) is NaN. An empty input is returned unchanged.
    :raises TypeError: if tseries is non-empty and has no DatetimeIndex
    :raises ValueError: if short_rate is larger than rate, or if a frequency
        string does not describe a fixed-length interval

    Copyright (c) 2017 WATTx GmbH
    License: Apache License
    """
    # return series if empty
    if tseries.empty:
        return tseries

    # check for datetime index (explicit raise: asserts vanish under ``python -O``)
    if not isinstance(tseries.index, pd.DatetimeIndex):
        raise TypeError('Object must have a datetime-like index.')

    # sort by time on a copy so the caller's object keeps its original order
    tseries = tseries.sort_index()

    rate_delta = _freq_to_timedelta(rate)

    # time intervals between consecutive samples, in seconds
    intervals = np.diff(tseries.index.values) / np.timedelta64(1, 's')

    # (start, stop) timestamps of every interval larger than max_gap
    big_gaps = []
    if max_gap is not None:
        gap_limit = _freq_to_timedelta(max_gap).total_seconds()
        gap_pos = np.flatnonzero(intervals > gap_limit)
        big_gaps = list(zip(tseries.index[gap_pos], tseries.index[gap_pos + 1]))

    if short_rate is None:
        positive = intervals[intervals > 0]
        if positive.size == 0:
            # single sample (or only duplicate timestamps): nothing finer
            # than `rate` can be derived from the data
            short_rate = rate
        else:
            smallest = float(positive.min())
            if smallest >= 1:
                # whole seconds, as the historical '%dS' formatting did
                short_delta = pd.Timedelta(seconds=int(smallest))
            else:
                # sub-second spacing: keep it instead of truncating to zero
                short_delta = pd.Timedelta(seconds=smallest)
            # if smallest interval is still larger than rate, use rate instead
            short_rate = rate if short_delta > rate_delta else short_delta
    else:
        # make sure entered short_rate is smaller than rate
        if _freq_to_timedelta(short_rate) > rate_delta:
            raise ValueError('short_rate must be <= rate')

    # upsample to short_rate and fill the new grid points linearly
    upsampled = tseries.resample(short_rate).mean().interpolate()
    # downsample to desired rate, taking the latest upsampled value per grid point
    resampled = upsampled.resample(rate).ffill()

    # replace values in large gap intervals with NaN (label slice, both ends inclusive)
    for start, stop in big_gaps:
        resampled.loc[start:stop] = np.nan

    return resampled


def _freq_to_timedelta(freq):
    """ Convert a fixed pandas frequency (string or offset) to a ``pd.Timedelta``.

    ``nanos`` raises ValueError for non-fixed frequencies such as month end.
    """
    return pd.Timedelta(to_offset(freq).nanos, unit='ns')
An alternative that might be more efficient is to use reindex (https://stackoverflow.com/a/40035879/2981639). As far as I can see this doesn't exhibit the shift introduced by resample().mean().interpolate()? The gap filter is quite useful!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Howdy! This would be very helpful for me right now, but where is the `to_offset` function defined? Edit: Ah, never mind, I should have googled:
from pandas.tseries.frequencies import to_offset