Skip to content

Instantly share code, notes, and snippets.

@yuchenlin yuchenlin/label_data.py
Last active May 23, 2017

Embed
What would you like to do?
Label the price changes in a CSV file
import os
import pandas as pd
# Look-ahead distance, in rows (ticks), used when labelling price changes.
STEP = 20

# Directory that holds the exported CSV files.
DBExport = '/home/bill/dm_proj/DBExport'

# Full paths of every ``.csv`` entry in DBExport, in sorted order, so that a
# numeric file id always refers to the same file.
filenames = sorted(
    os.path.join(DBExport, entry)
    for entry in os.listdir(DBExport)
    if entry.endswith(".csv")
)
def get_label(file_id):
    """Label every STEP-th row of one CSV file with its next price move.

    The "price" of a row is ``AskPrice1 + BidPrice1``.  Row ``i`` is compared
    with row ``i + STEP``; while the two prices are equal, the comparison
    target moves forward one row at a time until a different price is found
    or the file ends.  The label is ``1`` when the later price is higher and
    ``0`` when it is lower.  A row for which no differing price exists before
    the end of the file gets no label.

    The labels are written to ``'<STEP>_tick_label_<file_id>.txt'`` (a path
    relative to the current working directory), one ``row_index<TAB>label``
    pair per line.  The file name and each processed row index are printed
    as progress output.

    Args:
        file_id: Index into the module-level ``filenames`` list.

    Raises:
        IndexError: If ``file_id`` is not a valid index into ``filenames``.
        KeyError: If the CSV has no ``AskPrice1`` or ``BidPrice1`` column.
    """
    filename = filenames[file_id]  # the CSV file to process
    print(filename)
    df = pd.read_csv(filename)

    # Sum the two price columns once, up front, instead of building a row
    # Series through ``df.iloc`` for every single lookup in the loops below.
    prices = (df['AskPrice1'] + df['BidPrice1']).to_numpy()
    count = len(prices)

    labels = []  # list of (row_index, 0/1) tuples

    # ``k`` is one more than the number of rows scanned past ``i + STEP`` for
    # the most recent label.  When it exceeds STEP, the next sampled row sits
    # in the same flat stretch and resolves to the same differing price, so
    # its label can be copied instead of recomputed.
    k = 1
    for i in range(0, count - STEP, STEP):
        if STEP < k:
            k -= STEP
            labels.append((i, labels[-1][1]))
            continue

        cur_ask_bid = prices[i]
        next_ask_bid = prices[i + STEP]

        # If price(t) == price(t + STEP), keep looking at price(t + STEP + k)
        # until the price differs or the data runs out.
        k = 1
        while next_ask_bid == cur_ask_bid and i + STEP + k < count:
            next_ask_bid = prices[i + STEP + k]
            k += 1

        print(i)
        if cur_ask_bid < next_ask_bid:
            labels.append((i, 1))  # 1 means future -> increase
        elif cur_ask_bid > next_ask_bid:
            labels.append((i, 0))  # 0 means future -> decrease
        else:
            # No label was produced for this row, so later rows must not
            # reuse ``labels[-1]`` on its behalf.
            k = 1
            if next_ask_bid == cur_ask_bid:
                # The price is flat from row i + STEP to the end of the
                # file; every remaining sampled row lies in that stretch
                # and would be unlabelled as well.
                break

    # Save to a text file; ``with`` closes the handle even if the write fails.
    with open('%d_tick_label_%d.txt' % (STEP, file_id), 'w') as f:
        f.write('\n'.join('%d\t%d' % pair for pair in labels))
# Process the first CSV file in the sorted ``filenames`` list.
ind = 0
get_label(file_id=ind)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.