Skip to content

Instantly share code, notes, and snippets.

@ranaroussi
Created September 18, 2019 12:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ranaroussi/6c3345bf974c04f66c62e4d772959607 to your computer and use it in GitHub Desktop.
Save ranaroussi/6c3345bf974c04f66c62e4d772959607 to your computer and use it in GitHub Desktop.
QTPyLib timeseries resampler
import numpy as np
import pandas as pd
def _finalize(frame, tz=None):
    """Normalize the index of *frame*: timezone, ordering, duplicates.

    The frame's index is reassigned in place when a timezone applies, so
    callers pass a frame they own.

    Args:
        frame: DataFrame to normalize.
        tz: Target timezone. When ``None`` the index's own timezone (if
            it has one) is used.

    Returns:
        The frame sorted by index with duplicated index values removed,
        keeping the last row of each duplicate.
    """
    if tz is None:
        tz = getattr(frame.index, 'tz', None)

    if tz is not None:
        try:
            frame.index = frame.index.tz_convert(tz)
        except TypeError:
            # tz-naive index: treat the timestamps as UTC, then convert
            frame.index = frame.index.tz_localize('UTC').tz_convert(tz)

    frame = frame.sort_index()
    return frame[~frame.index.duplicated(keep='last')]


def _resample_ticks(data, freq=1000, by='last'):
    """Re-sample tick data into N-tick or ~N-volume OHLCV bars.

    Args:
        data: Tick DataFrame with ``last``/``lastsize`` columns, or
            ``close``/``volume`` columns as a fallback.
        freq: Number of ticks (or amount of volume) per bar.
        by: ``'size'``, ``'lastsize'`` or ``'volume'`` groups by
            cumulative volume; any other value groups by tick count.

    Returns:
        DataFrame with ``open``/``high``/``low``/``close``/``volume``
        columns, indexed by the timestamp of the first tick of each bar
        (index name ``datetime``).
    """
    data = data.fillna(value=np.nan)

    if 'last' in data.columns and 'lastsize' in data.columns:
        price_col, size_col = 'last', 'lastsize'
    else:
        price_col, size_col = 'close', 'volume'
    df = data[[price_col, size_col]].copy()

    # mark the first row of every group; the rows in between stay NaN
    if by in ('size', 'lastsize', 'volume'):
        df['cumvol'] = df[size_col].cumsum()
        # cumulative volume in units of `freq`, rounded to one decimal
        # and floored, then scaled back to a multiple of `freq`
        df['mark'] = freq * ((df['cumvol'] / freq / .1).round() * .1 // 1)
        df['diff'] = df['mark'].diff().fillna(0).astype(int)
        df['grp'] = np.where(df['diff'] >= freq - 1,
                             df['mark'] / freq, np.nan)
    else:
        position = np.arange(len(df))
        df['grp'] = np.where(position % freq == 0, position, np.nan)

    # seed the first row (by position, so any index type works) so the
    # forward fill below always has a starting group
    df.iloc[0, df.columns.get_loc('grp')] = 0

    # propagates group ids and also fills gaps in price/size
    df = df.ffill()

    # keep the timestamps in a column: they become the new index
    df['T'] = df.index
    grouped = df.groupby('grp', sort=False)

    bars = pd.DataFrame({
        'open': grouped[price_col].first(),
        'high': grouped[price_col].max(),
        'low': grouped[price_col].min(),
        'close': grouped[price_col].last(),
        'volume': grouped[size_col].sum(),
    })
    bars['datetime'] = grouped['T'].first()
    return bars.set_index('datetime')


def resample(data, resolution="1T", tz=None, ffill=True, dropna=False):
    """Resample tick or bar data into OHLCV bars.

    Column names are matched case-insensitively; the returned frame uses
    lower-case names. The input frame is not modified.

    Example::

        resample(df, '500K')  # 500 ticks per bar (by counting trades)
        resample(df, '500V')  # ~500 volume per bar (by counting volume)
        resample(df, '1T')    # 1-minute bars (pandas offset alias)

    Args:
        data: DataFrame with ``last``/``lastsize`` (ticks) or
            ``open``/``high``/``low``/``close``/``volume`` (bars). Time
            based resampling requires a datetime index.
        resolution: ``'<N>K'`` for N-tick bars, ``'<N>V'`` for ~N-volume
            bars, otherwise a pandas offset alias. The legacy minute
            alias ``'<N>T'`` is translated to ``'<N>min'``.
        tz: Timezone for the resulting index (time-based path only).
        ffill: When re-sampling bars adds rows, fill the empty bars from
            the previous close (``True``) or leave them empty so they
            are dropped (``False``).
        dropna: Drop rows containing any NaN (time-based path only).

    Returns:
        The resampled DataFrame. Tick/volume bars are returned as built,
        without the timezone / sort / de-duplication pass applied to
        time-based bars.
    """
    if data.empty:
        return _finalize(data.copy(), tz)

    # lower-case the column names on a copy instead of the caller's frame
    data = data.rename(columns=str.lower)

    digits = "".join(ch for ch in resolution if ch.isdigit())
    periods = int(digits) if digits else 1

    ohlcv = ['open', 'high', 'low', 'close', 'volume']

    # ---------------------------------------------
    # tick-count ("K") and volume ("V") bars
    if "K" in resolution or "V" in resolution:
        if periods > 1:
            by = 'last' if "K" in resolution else 'lastsize'
            data = _resample_ticks(data, freq=periods, by=by)
            data.dropna(inplace=True, subset=ohlcv)
        return data

    # ---------------------------------------------
    # time-based bars
    if resolution == "T" or (resolution.endswith("T")
                             and resolution[:-1].isdigit()):
        # "T" is the legacy spelling of the "min" offset alias
        resolution = resolution[:-1] + "min"

    if "last" in data.columns:
        ohlc = data['last'].resample(resolution).ohlc()
        data = data[['lastsize']].resample(resolution).sum()
        data = data.rename(columns={'lastsize': 'volume'})
        for col in ('open', 'high', 'low', 'close'):
            data[col] = ohlc[col]
    else:
        original_length = len(data)
        data = data.resample(resolution).agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum',
        })

        # deal with new rows caused by resample
        # NOTE: only applies when the result has more rows than the input
        if len(data) > original_length:
            # volume is 0 on rows created using resample
            data['volume'] = data['volume'].fillna(0)
            data = data.ffill()

            # empty bars are flat at the previous close, or blanked so
            # the cleanup below drops them when ffill is off
            filler = data['close'] if ffill else np.nan
            inactive = data['volume'] <= 0
            for col in ('open', 'high', 'low'):
                data[col] = np.where(inactive, filler, data[col])

    # drop NANs
    if dropna:
        data.dropna(inplace=True)

    # cleanup
    data.dropna(inplace=True, subset=ohlcv)
    data['volume'] = data['volume'].astype(int)

    return _finalize(data, tz)
@ranaroussi
Copy link
Author

Make sure your data has `last`/`lastsize` (or `close`/`volume`) columns for tick data, or `open`/`high`/`low`/`close`/`volume` columns for bar data at one-second resolution or coarser.

df = pd.read_csv('sample.csv', parse_dates=['Date', 'Time'], 
                index_col='Time')[['Volume','Unfiltered Price']]
df.columns = ['lastsize', 'last']

# for ~500 VOLUME
resampled = resample(df, '500V')

# for 500 TICKS (trades)
resampled = resample(df, '500K')

# for 1 MINUTE, etc: use pandas resample flags
resampled = resample(df, '1T')

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment