@guilespi
Created October 29, 2012 02:28
QSTK Reading stock info from disk
def get_data_hardread(self, ts_list, symbol_list, data_item, verbose=False, bIncDelist=False):
    '''
    Read data into a DataFrame no matter what.
    @param ts_list: List of timestamps for which the data values are needed. Timestamps must be sorted.
    @param symbol_list: The list of symbols for which the data values are needed
    @param data_item: The data_item needed. Like open, close, volume etc. May be a list, in which case a list of DataFrames is returned.
    @param bIncDelist: If true, delisted securities will be included.
    @note: If a symbol is not found then a message is printed. All the values in the column for that stock will be NaN. Execution then
    continues as usual. No errors are raised at the moment.
    '''
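    # Overall flow: map each requested data_item to a column index in the on-disk
    # records, read each symbol's file from disk (CSV or pickle), then align that
    # symbol's rows with ts_list and fill the per-item result matrices.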
    ''' Now support lists of items, still support old string behaviour '''
    bStr = False
    if isinstance(data_item, str):
        data_item = [data_item]
        bStr = True

    # init data struct - list of arrays, each member is an array corresponding to a different data type
    # arrays contain n rows for the timestamps and m columns for each stock
    all_stocks_data = []
    for i in range(len(data_item)):
        all_stocks_data.append(np.zeros((len(ts_list), len(symbol_list))))
        all_stocks_data[i][:][:] = np.NAN

    list_index = []
    ''' For each item in the list, add to list_index (later used to delete non-used items) '''
    for sItem in data_item:
        if (self.source == DataSource.CUSTOM):
            ''' If custom just load what you can '''
            if (sItem == DataItem.CLOSE):
                list_index.append(1)
            elif (sItem == DataItem.ACTUAL_CLOSE):
                list_index.append(2)
        if (self.source == DataSource.COMPUSTAT):
            ''' If compustat, look through list of features '''
            for i, sLabel in enumerate(DataItem.COMPUSTAT):
                if sItem == sLabel:
                    ''' First item is date index, labels start at 1 index '''
                    list_index.append(i + 1)
                    break
            else:
                raise ValueError("Incorrect value for data_item %s" % sItem)
        if (self.source == DataSource.NORGATE):
            if (sItem == DataItem.OPEN):
                list_index.append(1)
            elif (sItem == DataItem.HIGH):
                list_index.append(2)
            elif (sItem == DataItem.LOW):
                list_index.append(3)
            elif (sItem == DataItem.CLOSE):
                list_index.append(4)
            elif (sItem == DataItem.VOL):
                list_index.append(5)
            elif (sItem == DataItem.ACTUAL_CLOSE):
                list_index.append(6)
            else:
                # incorrect value
                raise ValueError("Incorrect value for data_item %s" % sItem)
        if (self.source == DataSource.YAHOOold):
            if (sItem == DataItem.OPEN):
                list_index.append(1)
            elif (sItem == DataItem.HIGH):
                list_index.append(2)
            elif (sItem == DataItem.LOW):
                list_index.append(3)
            elif (sItem == DataItem.CLOSE):
                list_index.append(4)
            elif (sItem == DataItem.VOL):
                list_index.append(5)
            elif (sItem == DataItem.ACTUAL_CLOSE):
                list_index.append(6)
            else:
                # incorrect value
                raise ValueError("Incorrect value for data_item %s" % sItem)
        if (self.source == DataSource.MLT or self.source == DataSource.YAHOO):
            if (sItem == DataItem.OPEN):
                list_index.append(1)
            elif (sItem == DataItem.HIGH):
                list_index.append(2)
            elif (sItem == DataItem.LOW):
                list_index.append(3)
            elif (sItem == DataItem.ACTUAL_CLOSE):
                list_index.append(4)
            elif (sItem == DataItem.VOL):
                list_index.append(5)
            elif (sItem == DataItem.CLOSE):
                list_index.append(6)
            else:
                # incorrect value
                raise ValueError("Incorrect value for data_item %s" % sItem)
        # end elif
    # end data_item loop
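    # At this point list_index holds, for each requested item, the column of that
    # field inside a stored record; column 0 is always the date stamp (YYYYMMDD).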
    # read in data for a stock
    symbol_ctr = -1
    for symbol in symbol_list:
        _file = None
        symbol_ctr = symbol_ctr + 1
        # print self.getPathOfFile(symbol)
        try:
            if (self.source == DataSource.CUSTOM) or (self.source == DataSource.MLT) or (self.source == DataSource.YAHOO):
                file_path = self.getPathOfCSVFile(symbol)
            else:
                file_path = self.getPathOfFile(symbol)

            ''' Get list of other files if we also want to include delisted '''
            if bIncDelist:
                lsDelPaths = self.getPathOfFile(symbol, True)
                if file_path == None and len(lsDelPaths) > 0:
                    print 'Found delisted paths:', lsDelPaths

            ''' If we don't have a file path continue... unless we have delisted paths '''
            if (type(file_path) != type("random string")):
                if bIncDelist == False or len(lsDelPaths) == 0:
                    continue  # File not found

            if not file_path == None:
                _file = open(file_path, "rb")
        except IOError:
            # If unable to read then continue. The value for this stock will be nan
            print _file
            continue

        assert(not _file == None or bIncDelist == True)
        ''' Open the file only if we have a valid name, otherwise we need delisted data '''
        if _file != None:
            if (self.source == DataSource.CUSTOM) or (self.source == DataSource.YAHOO) or (self.source == DataSource.MLT):
                creader = csv.reader(_file)
                row = creader.next()
                row = creader.next()
                # row.pop(0)
                for i, item in enumerate(row):
                    if i == 0:
                        try:
                            date = dt.datetime.strptime(item, '%Y-%m-%d')
                            date = date.strftime('%Y%m%d')
                            row[i] = float(date)
                        except:
                            date = dt.datetime.strptime(item, '%m/%d/%y')
                            date = date.strftime('%Y%m%d')
                            row[i] = float(date)
                    else:
                        row[i] = float(item)
                naData = np.array(row)
                for row in creader:
                    for i, item in enumerate(row):
                        if i == 0:
                            try:
                                date = dt.datetime.strptime(item, '%Y-%m-%d')
                                date = date.strftime('%Y%m%d')
                                row[i] = float(date)
                            except:
                                date = dt.datetime.strptime(item, '%m/%d/%y')
                                date = date.strftime('%Y%m%d')
                                row[i] = float(date)
                        else:
                            row[i] = float(item)
                    naData = np.vstack([np.array(row), naData])
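                # The two creader.next() calls above skip the CSV header and seed naData with the
                # first data row; dates are re-encoded as float YYYYMMDD values, and each later row
                # is stacked on top of the rows read so far, so the file order ends up reversed.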
            else:
                naData = pkl.load(_file)
            _file.close()
        else:
            naData = None
        ''' If we have delisted data, prepend to the current data '''
        if bIncDelist == True and len(lsDelPaths) > 0 and naData == None:
            for sFile in lsDelPaths[-1:]:
                ''' Changed to only use NEWEST data since sometimes there is overlap (JAVA) '''
                inFile = open(sFile, "rb")
                naPrepend = pkl.load(inFile)
                inFile.close()

                if naData == None:
                    naData = naPrepend
                else:
                    naData = np.vstack((naPrepend, naData))

        # now remove all the columns except the timestamps and one data column
        if verbose:
            print self.getPathOfFile(symbol)

        ''' Fix 1 row case by reshaping '''
        if (naData.ndim == 1):
            naData = naData.reshape(1, -1)

        # print naData
        # print list_index
        ''' We open the file once, for each data item we need, fill out the array in all_stocks_data '''
        for lLabelNum, lLabelIndex in enumerate(list_index):
            ts_ctr = 0
            b_skip = True
            ''' select timestamps and the data column we want '''
            temp_np = naData[:, (0, lLabelIndex)]
            # print temp_np
            num_rows = temp_np.shape[0]

            symbol_ts_list = range(num_rows)  # preallocate
            for i in range(0, num_rows):
                timebase = temp_np[i][0]
                timeyear = int(timebase / 10000)

                # Quick hack to skip most of the data
                # Note if we skip ALL the data, we still need to calculate
                # last time, so we know nothing is valid later in the code
                if timeyear < ts_list[0].year and i != num_rows - 1:
                    continue
                elif b_skip == True:
                    ts_ctr = i
                    b_skip = False

                timemonth = int((timebase - timeyear * 10000) / 100)
                timeday = int((timebase - timeyear * 10000 - timemonth * 100))
                timehour = 16
                # The earliest time datetime can represent is platform dependent
                symbol_ts_list[i] = dt.datetime(timeyear, timemonth, timeday, timehour)  # timestamp the row at 16:00 (market close) on that day
            # for ends
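            # After this loop symbol_ts_list[i] holds the 16:00 datetime for row i, and
            # ts_ctr points at the first row that was not skipped by the year shortcut.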
            # now we have only timestamps and one data column
            # Skip data from file which is before the first timestamp in ts_list
            while (ts_ctr < temp_np.shape[0]) and (symbol_ts_list[ts_ctr] < ts_list[0]):
                ts_ctr = ts_ctr + 1
                # print "skipping initial data"
            # while ends

            for time_stamp in ts_list:
                if (symbol_ts_list[-1] < time_stamp):
                    # The timestamp is after the last timestamp for which we have data. So we give up.
                    # Note that we don't have to fill in NaNs because that is the default value.
                    break
                else:
                    while ((ts_ctr < temp_np.shape[0]) and (symbol_ts_list[ts_ctr] < time_stamp)):
                        ts_ctr = ts_ctr + 1
                    # while ends
                # else ends

                # print "at time_stamp: " + str(time_stamp) + " and symbol_ts " + str(symbol_ts_list[ts_ctr])
                if (time_stamp == symbol_ts_list[ts_ctr]):
                    # Data is present for this timestamp. So add to numpy array.
                    # print " adding to numpy array"
                    if (temp_np.ndim > 1):  # This if is needed because if a stock has data for 1 day only then the numpy array is 1-D rather than 2-D
                        all_stocks_data[lLabelNum][ts_list.index(time_stamp)][symbol_ctr] = temp_np[ts_ctr][1]
                    else:
                        all_stocks_data[lLabelNum][ts_list.index(time_stamp)][symbol_ctr] = temp_np[1]
                    # if ends
                    ts_ctr = ts_ctr + 1
            # inner for ends
        # outer for ends
    # print all_stocks_data
    ldmReturn = []  # List of data matrices to return
    for naDataLabel in all_stocks_data:
        ldmReturn.append(pa.DataFrame(naDataLabel, ts_list, symbol_list))

    ''' Continue to support single return type as a non-list '''
    if bStr:
        return ldmReturn[0]
    else:
        return ldmReturn
# get_data_hardread ends
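For context, here is a hedged sketch of how this method is typically reached through QSTK's public API. The module paths, the 'Yahoo' data source name, and the claim that get_data falls back to get_data_hardread on a cache miss are assumptions taken from the QSTK tutorials, not from this gist.

import datetime as dt
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.DataAccess as da

# NYSE trading days between two dates, timestamped at 16:00 to match the hour used above
dt_start = dt.datetime(2011, 1, 1)
dt_end = dt.datetime(2011, 12, 31)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt.timedelta(hours=16))

# get_data shares this method's signature and, per the QSTK docs, reads from disk when no cache exists
c_dataobj = da.DataAccess('Yahoo')
ldf_data = c_dataobj.get_data(ldt_timestamps, ['AAPL', 'GOOG'], ['close', 'actual_close'])
df_close = ldf_data[0]  # one DataFrame per requested item, indexed by timestamp, one column per symbol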