# ranaroussi/resampler.py

## resampler.py
import pandas as pd

def resample(data, resolution="1T", tz=None, ffill=True, dropna=False):
    """
    Resample tick or bar data into OHLCV bars.

    >>> resample(df, '500K') # resample 500 ticks (by counting trades)
    >>> resample(df, '500V') # resample ~500 volume (by counting volume)

    data = DataFrame of ticks (last / lastsize columns) or bars
           (open / high / low / close / volume columns); column names are
           lower-cased on a copy, the caller's frame is not renamed
    resolution = digits followed by "K" (N-tick bars), "V" (~N-volume bars)
                 or a pandas time frequency (a trailing "T" is treated as
                 "min"); a missing count means 1
    tz = timezone for the index of time-based results (a tz-naive index is
         localized to UTC first); not applied to "K"/"V" results
    ffill = for bar input, when resampling added empty periods, fill them
            with the previous close (otherwise those periods are dropped)
    dropna = for tick input resampled by time, drop periods with no trades
             (bar input always drops incomplete rows)

    Returns the resampled DataFrame. Empty input skips resampling, and a
    "K"/"V" count of 1 or less returns the lower-cased input as is.
    """

    ohlcv = ['open', 'high', 'low', 'close', 'volume']
    nan = float('nan')

    def _finalize(frame, tz=None):
        """Apply the timezone, sort by index and drop duplicated stamps."""
        index = frame.index
        # only a DatetimeIndex carries a timezone; anything else (e.g. the
        # RangeIndex of an empty frame) is left untouched
        if isinstance(index, pd.DatetimeIndex):
            if tz is None:
                tz = index.tz
            if tz is not None and str(tz) != 'None':
                if index.tz is None:
                    index = index.tz_localize('UTC')
                frame.index = index.tz_convert(tz)

        # stable sort, so "last" among equal stamps is the last one received
        frame = frame.sort_index(kind='mergesort')
        return frame[~frame.index.duplicated(keep='last')]

    def _resample_ticks(ticks, freq=1000, by='last'):
        """
        Re-sample tick data into N-tick or ~N-volume OHLCV bars.

        ticks = DataFrame of raw ticks (last / lastsize), or of bars
                (close / volume) when those columns are missing
        freq = number of ticks (or amount of volume) per bar
        by = 'size', 'lastsize' or 'volume' groups by cumulative volume;
             anything else groups by tick count
        """
        if {'last', 'lastsize'}.issubset(ticks.columns):
            price_col, size_col = 'last', 'lastsize'
        else:
            price_col, size_col = 'close', 'volume'
        df = ticks[[price_col, size_col]].fillna(value=nan)

        # mark the row that opens each bar, leave the rest empty for now
        if by in ('size', 'lastsize', 'volume'):
            cumvol = df[size_col].cumsum()
            # cumulative volume in units of `freq`, rounded to one decimal
            # and floored: the count of completed volume buckets
            mark = freq * (((cumvol / freq) / .1).round() * .1 // 1)
            jump = mark.diff().fillna(0).astype(int)
            grp = (mark / freq).where(jump >= freq - 1)
        else:
            position = pd.Series(range(len(df)), index=df.index)
            grp = position.where(position % freq == 0)

        # the first row always opens the first bar (set by position: a
        # label slice with an integer fails on a DatetimeIndex)
        grp.iloc[0] = 0

        # carry prices, sizes and bar ids forward over missing values
        df = df.ffill()
        df['grp'] = grp.ffill().values

        # keep the timestamps in a column; the first one of every group
        # becomes the index of the resulting bar
        df['T'] = df.index

        grouped = df.groupby('grp', sort=False)
        price = grouped[price_col]
        bars = pd.DataFrame({
            'open': price.first(),
            'high': price.max(),
            'low': price.min(),
            'close': price.last(),
            'volume': grouped[size_col].sum(),
        })
        bars['datetime'] = grouped['T'].first()
        return bars.set_index('datetime')

    if data.empty:
        return _finalize(data, tz)

    # ---------------------------------------------
    # resample
    # (rename returns a copy, so the caller's column labels stay untouched)
    data = data.rename(columns=str.lower)

    digits = "".join(ch for ch in resolution if ch.isdigit())
    periods = int(digits) if digits else 1

    # tick-count ("K") and volume ("V") bars; "K" takes precedence
    for flag, by in (("K", 'last'), ("V", 'lastsize')):
        if flag in resolution:
            if periods > 1:
                data = _resample_ticks(data, freq=periods, by=by)
                data = data.dropna(subset=ohlcv)
            return data

    # "T" is the legacy pandas alias for minutes; "min" works everywhere
    if resolution.endswith("T"):
        resolution = resolution[:-1] + "min"

    if "last" in data.columns:
        # ticks -> time bars
        ohlc = data['last'].resample(resolution).ohlc()
        data = data.resample(resolution).agg(
            {'lastsize': 'sum'}).fillna(value=nan)
        data = data.rename(columns={'lastsize': 'volume'})
        for col in ('open', 'high', 'low', 'close'):
            data[col] = ohlc[col]

        if dropna:
            data = data.dropna()

    else:
        # bars -> bigger (or smaller) time bars
        original_length = len(data)
        data = data.resample(resolution).agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum',
        }).fillna(value=nan)

        # deal with new rows caused by resample
        if len(data) > original_length:
            # volume is 0 on rows created using resample
            data['volume'] = data['volume'].fillna(0)
            data = data.ffill()

            # flatten the created rows to the previous close, or blank
            # them so that the cleanup below removes them again
            created = data['volume'] <= 0
            filler = data['close'] if ffill else nan
            for col in ('open', 'high', 'low'):
                data[col] = data[col].mask(created, filler)

        # cleanup (only these five columns exist at this point, so this
        # also covers the `dropna` flag)
        data = data.dropna(subset=ohlcv)

    data['volume'] = data['volume'].astype(int)

    return _finalize(data, tz)
	import pandas as pd

	def resample(data, resolution="1T", tz=None, ffill=True, dropna=False):
	    """
	    Resample tick or bar data into OHLCV bars.

	    >>> resample(df, '500K') # resample 500 ticks (by counting trades)
	    >>> resample(df, '500V') # resample ~500 volume (by counting volume)

	    data = DataFrame of ticks (last / lastsize columns) or bars
	           (open / high / low / close / volume columns); column names are
	           lower-cased on a copy, the caller's frame is not renamed
	    resolution = digits followed by "K" (N-tick bars), "V" (~N-volume bars)
	                 or a pandas time frequency (a trailing "T" is treated as
	                 "min"); a missing count means 1
	    tz = timezone for the index of time-based results (a tz-naive index is
	         localized to UTC first); not applied to "K"/"V" results
	    ffill = for bar input, when resampling added empty periods, fill them
	            with the previous close (otherwise those periods are dropped)
	    dropna = for tick input resampled by time, drop periods with no trades
	             (bar input always drops incomplete rows)

	    Returns the resampled DataFrame. Empty input skips resampling, and a
	    "K"/"V" count of 1 or less returns the lower-cased input as is.
	    """

	    ohlcv = ['open', 'high', 'low', 'close', 'volume']
	    nan = float('nan')

	    def _finalize(frame, tz=None):
	        """Apply the timezone, sort by index and drop duplicated stamps."""
	        index = frame.index
	        # only a DatetimeIndex carries a timezone; anything else (e.g. the
	        # RangeIndex of an empty frame) is left untouched
	        if isinstance(index, pd.DatetimeIndex):
	            if tz is None:
	                tz = index.tz
	            if tz is not None and str(tz) != 'None':
	                if index.tz is None:
	                    index = index.tz_localize('UTC')
	                frame.index = index.tz_convert(tz)

	        # stable sort, so "last" among equal stamps is the last one received
	        frame = frame.sort_index(kind='mergesort')
	        return frame[~frame.index.duplicated(keep='last')]

	    def _resample_ticks(ticks, freq=1000, by='last'):
	        """
	        Re-sample tick data into N-tick or ~N-volume OHLCV bars.

	        ticks = DataFrame of raw ticks (last / lastsize), or of bars
	                (close / volume) when those columns are missing
	        freq = number of ticks (or amount of volume) per bar
	        by = 'size', 'lastsize' or 'volume' groups by cumulative volume;
	             anything else groups by tick count
	        """
	        if {'last', 'lastsize'}.issubset(ticks.columns):
	            price_col, size_col = 'last', 'lastsize'
	        else:
	            price_col, size_col = 'close', 'volume'
	        df = ticks[[price_col, size_col]].fillna(value=nan)

	        # mark the row that opens each bar, leave the rest empty for now
	        if by in ('size', 'lastsize', 'volume'):
	            cumvol = df[size_col].cumsum()
	            # cumulative volume in units of `freq`, rounded to one decimal
	            # and floored: the count of completed volume buckets
	            mark = freq * (((cumvol / freq) / .1).round() * .1 // 1)
	            jump = mark.diff().fillna(0).astype(int)
	            grp = (mark / freq).where(jump >= freq - 1)
	        else:
	            position = pd.Series(range(len(df)), index=df.index)
	            grp = position.where(position % freq == 0)

	        # the first row always opens the first bar (set by position: a
	        # label slice with an integer fails on a DatetimeIndex)
	        grp.iloc[0] = 0

	        # carry prices, sizes and bar ids forward over missing values
	        df = df.ffill()
	        df['grp'] = grp.ffill().values

	        # keep the timestamps in a column; the first one of every group
	        # becomes the index of the resulting bar
	        df['T'] = df.index

	        grouped = df.groupby('grp', sort=False)
	        price = grouped[price_col]
	        bars = pd.DataFrame({
	            'open': price.first(),
	            'high': price.max(),
	            'low': price.min(),
	            'close': price.last(),
	            'volume': grouped[size_col].sum(),
	        })
	        bars['datetime'] = grouped['T'].first()
	        return bars.set_index('datetime')

	    if data.empty:
	        return _finalize(data, tz)

	    # ---------------------------------------------
	    # resample
	    # (rename returns a copy, so the caller's column labels stay untouched)
	    data = data.rename(columns=str.lower)

	    digits = "".join(ch for ch in resolution if ch.isdigit())
	    periods = int(digits) if digits else 1

	    # tick-count ("K") and volume ("V") bars; "K" takes precedence
	    for flag, by in (("K", 'last'), ("V", 'lastsize')):
	        if flag in resolution:
	            if periods > 1:
	                data = _resample_ticks(data, freq=periods, by=by)
	                data = data.dropna(subset=ohlcv)
	            return data

	    # "T" is the legacy pandas alias for minutes; "min" works everywhere
	    if resolution.endswith("T"):
	        resolution = resolution[:-1] + "min"

	    if "last" in data.columns:
	        # ticks -> time bars
	        ohlc = data['last'].resample(resolution).ohlc()
	        data = data.resample(resolution).agg(
	            {'lastsize': 'sum'}).fillna(value=nan)
	        data = data.rename(columns={'lastsize': 'volume'})
	        for col in ('open', 'high', 'low', 'close'):
	            data[col] = ohlc[col]

	        if dropna:
	            data = data.dropna()

	    else:
	        # bars -> bigger (or smaller) time bars
	        original_length = len(data)
	        data = data.resample(resolution).agg({
	            'open': 'first',
	            'high': 'max',
	            'low': 'min',
	            'close': 'last',
	            'volume': 'sum',
	        }).fillna(value=nan)

	        # deal with new rows caused by resample
	        if len(data) > original_length:
	            # volume is 0 on rows created using resample
	            data['volume'] = data['volume'].fillna(0)
	            data = data.ffill()

	            # flatten the created rows to the previous close, or blank
	            # them so that the cleanup below removes them again
	            created = data['volume'] <= 0
	            filler = data['close'] if ffill else nan
	            for col in ('open', 'high', 'low'):
	                data[col] = data[col].mask(created, filler)

	        # cleanup (only these five columns exist at this point, so this
	        # also covers the `dropna` flag)
	        data = data.dropna(subset=ohlcv)

	    data['volume'] = data['volume'].astype(int)

	    return _finalize(data, tz)