# yhilpisch/big_tstables.py

## big_tstables.py
#
# TsTables -- High Frequency Time Series Data with PyTables
#
# to install -- pip install tstables
# Github repo -- https://github.com/afiedler/tstables
#
# Dr. Yves J. Hilpisch
#
# The Python Quants GmbH
# http://quant-platform.com
# http://pythonquants.com
#
import os
import numpy as np
import pandas as pd
import tables as tb
import random
from time import time
import tstables
from datetime import datetime

## Preliminaries

# First, some basic assumptions.

path = '/notebooks/ssd/data/'  # delete or adjust

co = 10  # number of time series
# One second as a fraction of a 360-day year (12 months of 30 days).
# The sample data below is generated at one-second frequency, so the
# step must be counted in seconds (the trailing ``* 60``), not minutes.
dt = 1. / (12 * 30 * 24 * 60 * 60)
vol = 0.2  # volatility for stochastic processes


# Second, a function to generate paths based on geometric Brownian motion.

def generate_paths(no, initials):
    """Simulate geometric Brownian motion paths without drift.

    Uses the module-level settings ``vol`` (volatility), ``dt`` (time step)
    and ``co`` (number of series).

    Parameters
    ----------
    no : int
        Number of time steps (rows) to generate.
    initials : array-like
        Starting value of each series; broadcast against ``co`` columns.

    Returns
    -------
    numpy.ndarray
        Array with ``no`` rows and ``co`` columns whose first row equals
        ``initials``.
    """
    drift = -0.5 * vol ** 2 * dt
    shocks = vol * np.sqrt(dt) * np.random.standard_normal((no, co))
    # accumulate log-returns along the time axis
    log_returns = np.cumsum(drift + shocks, axis=0)
    paths = initials * np.exp(log_returns)
    # pin the first observation to the starting values
    paths[0] = initials
    return paths


## Sample Data and Storage

# To store the time series data in a PyTables table we define
# the table structure.

class TS(tb.IsDescription):
    """Row layout of the time series table: one timestamp plus ten series.

    The ``pos`` arguments number the columns 0 through 10 in the order in
    which they are declared here.
    """
    # 64-bit integer timestamp in the first position.
    # NOTE(review): presumably the column TsTables uses to order and look
    # up rows -- confirm against the TsTables documentation.
    timestamp = tb.Int64Col(pos=0)
    # One Float64 column per simulated series; the number of columns (10)
    # equals the module-level ``co`` setting.
    ts1 = tb.Float64Col(pos=1)
    ts2 = tb.Float64Col(pos=2)
    ts3 = tb.Float64Col(pos=3)
    ts4 = tb.Float64Col(pos=4)
    ts5 = tb.Float64Col(pos=5)
    ts6 = tb.Float64Col(pos=6)
    ts7 = tb.Float64Col(pos=7)
    ts8 = tb.Float64Col(pos=8)
    ts9 = tb.Float64Col(pos=9)
    ts10 = tb.Float64Col(pos=10)


# Then open a database file and create the table object.

h5 = tb.open_file(path + 'ts_sec.h5', 'w')


# TsTables adds a new function ``create_ts`` to PyTables.

ts = h5.create_ts('/', 'TS', TS)


# We can now generate and write sample data to the table object.
# The data is simulated and appended one calendar month at a time,
# for 18 months starting in January 2015.

t0 = time()
start = datetime(2015, 1, 1, 0, 0)
initials = np.array([100.] * co)
total = 0
for i in range(18):
    # first day of the following month, rolling over into the next year
    if start.month == 12:
        end = datetime(start.year + 1, 1, 1)
    else:
        end = datetime(start.year, start.month + 1, 1)
    # ``date_range`` includes both end points, so the last row is stamped
    # ``end`` -- the same timestamp the next month starts with.
    index = pd.date_range(start, end, freq='1s')
    start = end
    data = pd.DataFrame(generate_paths(len(index), initials),
                        index=index)
    # the row at ``end`` seeds the next month ...
    initials = data.values[-1]
    # ... and is stored only as the first row of that next month, so no
    # timestamp is written to the table twice.
    chunk = data.iloc[:-1]
    print("%d %d" % (i, len(chunk)))
    total += len(chunk)
    ts.append(chunk)

# Stop the clock first so that the file size lookup is not timed.
duration = time() - t0
size = os.path.getsize(path + 'ts_sec.h5') / 1e6  # decimal megabytes

print("\ndata written to disk in mega bytes        %d" % size)
# report the same elapsed time that the speed figures are based on
print("time to write all data in seconds         %5.3f" % duration)

# no int() here: the format asks for one decimal place
print("\nwriting speed in mega bytes per second    %.1f" % (size / duration))
print("writing speed in records per second       %d" % int(total / duration))


# The strength of TsTables lies in retrieving chunks of time series
# data defined by a start date and an end date (which obviously is a
# typical case in finance, e.g. in backtesting strategies or risk
# management).

t0 = time()

# Query window on 7 February 2015: from midnight up to 23:59.
read_start_dt = datetime(year=2015, month=2, day=7, hour=0, minute=0)
read_end_dt = datetime(year=2015, month=2, day=7, hour=23, minute=59)

# ``read_range`` is the TsTables call for fetching such a window; the
# result is used as a pandas DataFrame below.

rows = ts.read_range(read_start_dt, read_end_dt)

print("\ntime to retrieve data slice in seconds    %4.3f" % (time() - t0))

# non-missing entries in the first column of the result
print("number of rows in results object          %d" % rows.count()[0])

print("\nthe first 5 rows of this object:")
print(np.round(rows.head(), 2).to_string())

# The following simulates a randomized access where
# chunks of daily data sets are accessed and retrieved
# as pandas DataFrame objects.

its = 250
total = 0
t0 = time()
for _ in range(its):
    # day stops at 27 so that ``day + 1`` exists in every month
    day = random.randint(1, 27)
    month = random.randint(1, 11)
    read_start_dt = datetime(2015, month, day, 0, 0)
    read_end_dt = datetime(2015, month, day + 1, 0, 0)
    rows = ts.read_range(read_start_dt, read_end_dt)
    total = total + rows.count()[0]
t1 = time()
duration = t1 - t0

print("\ntime for %d random accesses       %5.3f seconds" % (its, duration))
print("average time for random access     %5.3f seconds" % (duration / its))
print("number of records per second       %d" % (total / duration))


# Finish up: close the HDF5 file, then delete the sample data from disk.

h5.close()
os.remove(path + 'ts_sec.h5')
	#
	# TsTables -- High Frequency Time Series Data with PyTables
	#
	# to install -- pip install tstables
	# Github repo -- https://github.com/afiedler/tstables
	#
	# Dr. Yves J. Hilpisch
	#
	# The Python Quants GmbH
	# http://quant-platform.com
	# http://pythonquants.com
	#
	import os
	import numpy as np
	import pandas as pd
	import tables as tb
	import random
	from time import time
	import tstables
	from datetime import datetime

	## Preliminaries

	# First, some basic assumptions.

	path = '/notebooks/ssd/data/'  # delete or adjust

	co = 10  # number of time series
	# One second as a fraction of a 360-day year (12 months of 30 days).
	# The sample data below is generated at one-second frequency, so the
	# step must be counted in seconds (the trailing ``* 60``), not minutes.
	dt = 1. / (12 * 30 * 24 * 60 * 60)
	vol = 0.2  # volatility for stochastic processes


	# Second, a function to generate paths based on geometric Brownian motion.

	def generate_paths(no, initials):
	    """Simulate geometric Brownian motion paths without drift.

	    Uses the settings ``vol`` (volatility), ``dt`` (time step) and
	    ``co`` (number of series) defined above.

	    Parameters
	    ----------
	    no : int
	        Number of time steps (rows) to generate.
	    initials : array-like
	        Starting value of each series; broadcast against ``co`` columns.

	    Returns
	    -------
	    numpy.ndarray
	        Array with ``no`` rows and ``co`` columns whose first row equals
	        ``initials``.
	    """
	    drift = -0.5 * vol ** 2 * dt
	    shocks = vol * np.sqrt(dt) * np.random.standard_normal((no, co))
	    # accumulate log-returns along the time axis
	    paths = initials * np.exp(np.cumsum(drift + shocks, axis=0))
	    # pin the first observation to the starting values
	    paths[0] = initials
	    return paths


	## Sample Data and Storage

	# To store the time series data in a PyTables table we define
	# the table structure.

	class TS(tb.IsDescription):
	    """Row layout of the time series table: one timestamp plus ten series.

	    The ``pos`` arguments number the columns 0 through 10 in the order
	    in which they are declared here.
	    """
	    # 64-bit integer timestamp in the first position.
	    # NOTE(review): presumably the column TsTables uses to order and
	    # look up rows -- confirm against the TsTables documentation.
	    timestamp = tb.Int64Col(pos=0)
	    # One Float64 column per simulated series (10, equal to ``co``).
	    ts1 = tb.Float64Col(pos=1)
	    ts2 = tb.Float64Col(pos=2)
	    ts3 = tb.Float64Col(pos=3)
	    ts4 = tb.Float64Col(pos=4)
	    ts5 = tb.Float64Col(pos=5)
	    ts6 = tb.Float64Col(pos=6)
	    ts7 = tb.Float64Col(pos=7)
	    ts8 = tb.Float64Col(pos=8)
	    ts9 = tb.Float64Col(pos=9)
	    ts10 = tb.Float64Col(pos=10)


	# Then open a database file and create the table object.

	h5 = tb.open_file(path + 'ts_sec.h5', 'w')


	# TsTables adds a new function ``create_ts`` to PyTables.

	ts = h5.create_ts('/', 'TS', TS)


	# We can now generate and write sample data to the table object.
	# The data is simulated and appended one calendar month at a time,
	# for 18 months starting in January 2015.

	t0 = time()
	start = datetime(2015, 1, 1, 0, 0)
	initials = np.array([100.] * co)
	total = 0
	for i in range(18):
	    # first day of the following month, rolling over into the next year
	    if start.month == 12:
	        end = datetime(start.year + 1, 1, 1)
	    else:
	        end = datetime(start.year, start.month + 1, 1)
	    # ``date_range`` includes both end points, so the last row is
	    # stamped ``end`` -- the same timestamp the next month starts with.
	    index = pd.date_range(start, end, freq='1s')
	    start = end
	    data = pd.DataFrame(generate_paths(len(index), initials),
	                        index=index)
	    # the row at ``end`` seeds the next month ...
	    initials = data.values[-1]
	    # ... and is stored only as the first row of that next month, so
	    # no timestamp is written to the table twice.
	    chunk = data.iloc[:-1]
	    print("%d %d" % (i, len(chunk)))
	    total += len(chunk)
	    ts.append(chunk)

	# Stop the clock first so that the file size lookup is not timed.
	duration = time() - t0
	size = os.path.getsize(path + 'ts_sec.h5') / 1e6  # decimal megabytes

	print("\ndata written to disk in mega bytes %d" % size)
	# report the same elapsed time that the speed figures are based on
	print("time to write all data in seconds %5.3f" % duration)

	# no int() here: the format asks for one decimal place
	print("\nwriting speed in mega bytes per second %.1f" % (size / duration))
	print("writing speed in records per second %d" % int(total / duration))


	# The strength of TsTables lies in retrieving chunks of time series
	# data defined by a start date and an end date (which obviously is a
	# typical case in finance, e.g. in backtesting strategies or risk
	# management).

	t0 = time()

	# Query window on 7 February 2015: from midnight up to 23:59.
	read_start_dt = datetime(year=2015, month=2, day=7, hour=0, minute=0)
	read_end_dt = datetime(year=2015, month=2, day=7, hour=23, minute=59)

	# ``read_range`` is the TsTables call for fetching such a window; the
	# result is used as a pandas DataFrame below.

	rows = ts.read_range(read_start_dt, read_end_dt)

	print("\ntime to retrieve data slice in seconds %4.3f" % (time() - t0))

	# non-missing entries in the first column of the result
	print("number of rows in results object %d" % rows.count()[0])

	print("\nthe first 5 rows of this object:")
	print(np.round(rows.head(), 2).to_string())

	# The following simulates a randomized access where
	# chunks of daily data sets are accessed and retrieved
	# as pandas DataFrame objects.

	t0 = time()
	its = 250
	total = 0
	for _ in range(its):
	    # day stops at 27 so that ``day + 1`` exists in every month
	    day = random.randint(1, 27)
	    month = random.randint(1, 11)
	    read_start_dt = datetime(2015, month, day, 0, 0)
	    read_end_dt = datetime(2015, month, day + 1, 0, 0)
	    rows = ts.read_range(read_start_dt, read_end_dt)
	    total += rows.count()[0]
	# the clock stops once, after all reads have finished
	t1 = time()

	duration = t1 - t0
	print("\ntime for %d random accesses %5.3f seconds" % (its, duration))
	print("average time for random access %5.3f seconds" % (duration / its))
	print("number of records per second %d" % (total / duration))


	# close database and remove sample data file

	h5.close()
	os.remove(path + 'ts_sec.h5')