Created
May 1, 2015 23:26
-
-
Save davclark/81d7cc370126c7aa0ddc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from zipfile import ZipFile\n", | |
"from datetime import datetime\n", | |
"\n", | |
"from pytz import timezone\n", | |
"import numpy as np\n", | |
"from numpy.lib import recfunctions\n", | |
"import tables as tb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fname = '../local_data/EQY_US_ALL_BBO_20140206.zip'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"```\n", | |
"fields of dailyquotes file taqquote\n", | |
"[0:8]HHMMSSXXX\n", | |
"[9] text EXCHANGE N Nyse T/Q NASDAQ\n", | |
"[10:25] text symbol 6+10\n", | |
"[26:36] bid price 7+4\n", | |
"[37:43] bid size (units)\n", | |
"[44:54] ask price 7+4\n", | |
"[55:61] ask size\n", | |
"[62] text Condition of quote\n", | |
"[63:66] market maker\n", | |
"[67] bid exchange\n", | |
"[68] ask aexchange\n", | |
"[69:84] int seqno\n", | |
"[85] int bbo indicator\n", | |
"[86] int NASDAQ BBO indocator\n", | |
"[87] text cancel/correction\n", | |
"[88] text C=CTA N=UTP\n", | |
"[90] text Retail interest indicator\n", | |
"[...]\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read this in" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"```\n", | |
"# Two characters are also used at the end of each line as a line indicator\n", | |
"widths = [9, 1, 16, 11, 7, 11, 7, 1, 4, 1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n", | |
"\n", | |
"w = np.array(widths)\n", | |
"w.cumsum()\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that we'll have 98 bytes total with the `\\r\\n` on the end." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"BYTES_PER_LINE = 98" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false | |
}, | |
"source": [ | |
"```\n", | |
"# Note - we're using object here (as pandas will do this anyway), \n", | |
"# and we'll need to convert back to fixed width strings later\n", | |
"# We can get the widths from the widths list above\n", | |
"old_dtype = [('Time', np.datetime64),\n", | |
" ('Exchange', object), # |S1\n", | |
" ('Symbol', object), # |S16, etc.\n", | |
" ('Bid_Price', np.float64),\n", | |
" ('Bid_Size', np.int32),\n", | |
" ('Ask_Price', np.float64),\n", | |
" ('Ask_Size', np.int32),\n", | |
" ('Quote_Condition', object),\n", | |
" ('Market_Maker', np.int), # This is blank - want to skip?\n", | |
" ('Bid_Exchange', object),\n", | |
" ('Ask_Exchange', object),\n", | |
" ('Sequence_Number', np.int64),\n", | |
" ('National_BBO_Ind', np.int8), # These aren't really numbers\n", | |
" ('National_BBO_Ind', np.int8), # Maybe should be string?\n", | |
" ('Quote_Cancel_Correction', object),\n", | |
" ('Source_of_Quote', object),\n", | |
" ('Retail_Interest_Indicator_RPI', object),\n", | |
" ('Short_Sale_Restriction_Indicator', object),\n", | |
" ('LULD_BBO_Indicator_CQS', object),\n", | |
" ('LULD_BBO_Indicator_UTP', object),\n", | |
" ('FINRA_ADF_MPID_Indicator', object),\n", | |
" ('SIP_generated_Message_Identifier', object),\n", | |
" ('National_BBO_LULD_Indicator', object)\n", | |
" ] # Then there's two characters for newline\n", | |
"\n", | |
"# This was for pandas' screwball approach to dtype\n", | |
"# names = [a for a,b in dtype]\n", | |
"# dtype = dict(dtype)\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Note that the '|' character means byte order doesn't matter, \n", | |
"# which it never will for \"bytes\" (which is what 'S' stands for)\n", | |
"initial_dtype = [('Time', 'S9'), # HHMMSSmmm, should be in Eastern Time (ET)\n", | |
" # ('hour', '|S2'),\n", | |
" # ('minute', '|S2'),\n", | |
" # ('second', '|S2'),\n", | |
" # ('msec', '|S3'),\n", | |
" ('Exchange', 'S1'),\n", | |
" ('Symbol', 'S16'),\n", | |
" ('Bid_Price', 'S11'), # 7.4 (fixed point)\n", | |
" ('Bid_Size', 'S7'),\n", | |
" ('Ask_Price', 'S11'), # 7.4\n", | |
" ('Ask_Size', 'S7'),\n", | |
" ('Quote_Condition', 'S1'),\n", | |
" ('Market_Maker', 'S4'), # This ends up getting discarded, it should always be b' '\n", | |
" ('Bid_Exchange', 'S1'),\n", | |
" ('Ask_Exchange', 'S1'),\n", | |
" ('Sequence_Number', 'S16'),\n", | |
" ('National_BBO_Ind', 'S1'),\n", | |
" ('NASDAQ_BBO_Ind', 'S1'),\n", | |
" ('Quote_Cancel_Correction', 'S1'),\n", | |
" ('Source_of_Quote', 'S1'),\n", | |
" ('Retail_Interest_Indicator_RPI', 'S1'),\n", | |
" ('Short_Sale_Restriction_Indicator', 'S1'),\n", | |
" ('LULD_BBO_Indicator_CQS', 'S1'),\n", | |
" ('LULD_BBO_Indicator_UTP', 'S1'),\n", | |
" ('FINRA_ADF_MPID_Indicator', 'S1'),\n", | |
" ('SIP_generated_Message_Identifier', 'S1'),\n", | |
" ('National_BBO_LULD_Indicator', 'S1'),\n", | |
" ('newline', 'S2')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Justin and Pandas (I think) use time64, as does PyTables.\n", | |
"# We could use msec from beginning of day for now in an int16\n", | |
"# (maybe compare performance to datetime64? But dates should compress very well...)\n", | |
"time_col = 'Time'\n", | |
"\n", | |
"convert_dtype = [\n", | |
" ('Bid_Price', np.float64),\n", | |
" ('Bid_Size', np.int32),\n", | |
" ('Ask_Price', np.float64),\n", | |
" ('Ask_Size', np.int32),\n", | |
" # ('Market_Maker', np.int8), # This is not currently used, and should always be b' '\n", | |
" ('Sequence_Number', np.int64),\n", | |
" # ('National_BBO_Ind', np.int8), # The _Ind fields are actually categorical - leaving as strings\n", | |
" # ('NASDAQ_BBO_Ind', np.int8),\n", | |
" ]\n", | |
"\n", | |
"passthrough_strings = ['Exchange',\n", | |
" 'Symbol',\n", | |
" 'Quote_Condition',\n", | |
" 'Bid_Exchange',\n", | |
" 'Ask_Exchange',\n", | |
" 'National_BBO_Ind', # The _Ind fields are actually categorical - leaving as strings\n", | |
" 'NASDAQ_BBO_Ind',\n", | |
" 'Quote_Cancel_Correction',\n", | |
" 'Source_of_Quote',\n", | |
" 'Retail_Interest_Indicator_RPI',\n", | |
" 'Short_Sale_Restriction_Indicator',\n", | |
" 'LULD_BBO_Indicator_CQS',\n", | |
" 'LULD_BBO_Indicator_UTP',\n", | |
" 'FINRA_ADF_MPID_Indicator',\n", | |
" 'SIP_generated_Message_Identifier',\n", | |
" 'National_BBO_LULD_Indicator']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Lifted from blaze.pytables\n", | |
"def dtype_to_pytables(dtype):\n", | |
" \"\"\" Convert NumPy dtype to PyTable descriptor\n", | |
" Examples\n", | |
" --------\n", | |
" >>> from tables import Int32Col, StringCol, Time64Col\n", | |
" >>> dt = np.dtype([('name', 'S7'), ('amount', 'i4'), ('time', 'M8[us]')])\n", | |
" >>> dtype_to_pytables(dt) # doctest: +SKIP\n", | |
" {'amount': Int32Col(shape=(), dflt=0, pos=1),\n", | |
" 'name': StringCol(itemsize=7, shape=(), dflt='', pos=0),\n", | |
" 'time': Time64Col(shape=(), dflt=0.0, pos=2)}\n", | |
" \"\"\"\n", | |
" d = {}\n", | |
" for pos, name in enumerate(dtype.names):\n", | |
" dt, _ = dtype.fields[name]\n", | |
" if issubclass(dt.type, np.datetime64):\n", | |
" tdtype = tb.Description({name: tb.Time64Col(pos=pos)}),\n", | |
" else:\n", | |
" tdtype = tb.descr_from_dtype(np.dtype([(name, dt)]))\n", | |
" el = tdtype[0] # removed dependency on toolz -DJC\n", | |
" getattr(el, name)._v_pos = pos\n", | |
" d.update(el._v_colobjects)\n", | |
" return d" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# The \"easy\" dtypes are the \"not datetime\" dtypes\n", | |
"easy_dtype = []\n", | |
"convert_dict = dict(convert_dtype)\n", | |
"\n", | |
"for name, dtype in initial_dtype:\n", | |
" if name in convert_dict:\n", | |
" easy_dtype.append( (name, convert_dict[name]) )\n", | |
" elif name in passthrough_strings:\n", | |
" easy_dtype.append( (name, dtype) )\n", | |
"\n", | |
"# PyTables will not accept np.datetime64, we hack below, but we use it to work with the blaze\n", | |
"# function above.\n", | |
"# We also shift Time to the end (while I'd rather maintain order), as it's more efficient for Dav\n", | |
"# given the technical debt he's already built up.\n", | |
"pytables_dtype = easy_dtype + [('Time', 'datetime64[ms]')]\n", | |
"\n", | |
"pytables_desc = dtype_to_pytables(\n", | |
" np.dtype(pytables_dtype) )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class TAQ2HDF5:\n", | |
" \n", | |
" def __init__(self, taq_fname):\n", | |
" self.taq_fname = taq_fname\n", | |
" \n", | |
" def convert_taq(self):\n", | |
" # The below doesn't work for pandas (and neither does `unzip` from the command line). Probably want to use\n", | |
" # something like `7z x -so my_file.zip 2> /dev/null` if we use pandas.\n", | |
" with ZipFile(self.taq_fname) as zfile:\n", | |
" for inside_f in zfile.filelist:\n", | |
" # The original filename is available as inside_f.filename\n", | |
" with zfile.open(inside_f.filename) as infile:\n", | |
" first = infile.readline()\n", | |
"\n", | |
" # You need to use bytes to split bytes\n", | |
" dateish, numlines = first.split(b\":\")\n", | |
" numlines = int(numlines)\n", | |
" \n", | |
" # Get dates to combine with times later\n", | |
" # This is a little over-trusting of the spec...\n", | |
" self.month = int(dateish[2:4])\n", | |
" self.day = int(dateish[4:6])\n", | |
" self.year = int(dateish[6:10])\n", | |
"\n", | |
" # Should I use a context manager here?\n", | |
" h5_table = self.setup_hdf5(inside_f.filename, numlines)\n", | |
" try:\n", | |
" self.raw_conversion(numlines, infile, h5_table)\n", | |
" finally:\n", | |
" self.finalize_hdf5()\n", | |
" \n", | |
" def setup_hdf5(self, h5_fname_root, numlines):\n", | |
" # We're using aggressive compression and checksums, since this will likely stick around\n", | |
" # Stopping one level short of max compression - don't be greedy.\n", | |
" self.h5 = tb.open_file(h5_fname_root + '.h5', title=h5_fname_root, mode='w', \n", | |
" filters=tb.Filters(complevel=8, complib='blosc:lz4hc', fletcher32=True) )\n", | |
" \n", | |
" return self.h5.create_table('/', 'daily_quotes', description=pytables_desc, expectedrows=numlines)\n", | |
" \n", | |
" \n", | |
" def finalize_hdf5(self):\n", | |
" self.h5.close()\n", | |
"\n", | |
" def process_chunk(self, all_strings):\n", | |
" # This is unnecessary copying\n", | |
" easy_converted = all_strings.astype(easy_dtype)\n", | |
" \n", | |
" # These don't have the decimal point in the TAQ file\n", | |
" for dollar_col in ['Bid_Price', 'Ask_Price']:\n", | |
" easy_converted[dollar_col] /= 10000\n", | |
" \n", | |
" # Currently, there doesn't seem to be any utility to converting to numpy.datetime64\n", | |
" # PyTables wants float64's corresponding to the POSIX Standar (relative to 1970-01-01, UTC)\n", | |
" converted_time = [datetime( self.year, self.month, self.day, \n", | |
" int(raw[:2]), int(raw[2:4]), int(raw[4:6]),\n", | |
" int(raw[6:9]) * 1000, # msec needs to be microsec \n", | |
" tzinfo=timezone('US/Eastern') ).timestamp()\n", | |
" for raw in all_strings['Time'] ]\n", | |
"\n", | |
" \n", | |
" # More unnecessary copying\n", | |
" records = recfunctions.append_fields(easy_converted, 'Time', converted_time, usemask=False)\n", | |
" \n", | |
" return records\n", | |
" \n", | |
" \n", | |
" # at some point, we might optimize chunksize. For now, assume PyTables is smart\n", | |
" def raw_conversion(self, numlines, infile, out, chunksize=None):\n", | |
" '''Read raw bytes from TAQ, write to HDF5'''\n", | |
" if chunksize is None:\n", | |
" chunksize = out.chunkshape[0]\n", | |
" \n", | |
" while(True):\n", | |
" raw_bytes = infile.read(BYTES_PER_LINE * chunksize)\n", | |
" if raw_bytes == '':\n", | |
" break\n", | |
" # If we use asarray with this dtype, it crashes Python! (might not be true anymore)\n", | |
" # ndarray gives 'S' arrays instead of chararrays (as recarray does)\n", | |
" all_strings = np.ndarray(chunksize, buffer=raw_bytes, dtype=initial_dtype)\n", | |
"\n", | |
" # This approach doesn't work...\n", | |
" # out[chunk_start:chunk_stop, 1:] = all_strings[:,1:-1]\n", | |
" \n", | |
" out.append( self.process_chunk(all_strings) )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'075300081PA 000000000000000000000007294000000027R PP000000000007625512 C \\r\\n075300085PA 000000000000000000000006076000000010R PP000000000007625612 C \\r\\n'" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"two_lines" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Let's process our file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_run = TAQ2HDF5(fname)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%time test_run.convert_taq()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 248, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"h5 = tb.open_file('./taqquote20140206.h5')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 250, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ (b'P', b'A ', 0.0, 0, 0.0, 0, b'R', b'P', b'P', 14, b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391676960.901),\n", | |
" (b'P', b'A ', 0.0, 0, 72.94, 27, b'R', b'P', b'P', 76255, b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391690940.081),\n", | |
" (b'P', b'A ', 0.0, 0, 60.76, 10, b'R', b'P', b'P', 76256, b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391690940.085),\n", | |
" ...,\n", | |
" (b'K', b'A ', 57.79, 1, 57.92, 1, b'R', b'K', b'K', 2631333, b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391698830.251),\n", | |
" (b'Z', b'A ', 57.82, 3, 57.84, 1, b'R', b'Z', b'Z', 2632737, b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391698831.256),\n", | |
" (b'N', b'A ', 57.82, 4, 57.84, 3, b'R', b'N', b'N', 2632938, b'A', b'C', b'B', b' ', b' ', b' ', b' ', b' ', b'A', 1391698831.333)], \n", | |
" dtype=[('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1'), ('Time', '<f8')])" | |
] | |
}, | |
"execution_count": 250, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"h5.root.daily_quotes[:]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 232, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"h5.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment