Skip to content

Instantly share code, notes, and snippets.

@davidwtbuxton
Created March 6, 2012 23:10
Show Gist options
  • Save davidwtbuxton/1989671 to your computer and use it in GitHub Desktop.
Save davidwtbuxton/1989671 to your computer and use it in GitHub Desktop.
# http://www.reddit.com/r/learnpython/comments/qkh43/new_to_python_searching_csv_files/
# http://stackoverflow.com/questions/9564322/loop-through-rows-of-one-csv-file-to-find-corresponding-data-in-another
# http://stackoverflow.com/questions/9577997/search-through-csv-from-specific-row-down
import csv
# Band offsets applied to a bid price: DELTA_HI is added to the bid to get the
# upper bound, DELTA_LO is subtracted from it to get the lower bound.
# Note these are floats, so don't expect perfect decimal mathematics.
DELTA_HI = 0.001
DELTA_LO = 0.0015


def _open_csv(filename):
    """Open *filename* for reading in the mode the ``csv`` module expects."""
    import sys

    if sys.version_info[0] < 3:
        # Python 2: universal-newline text mode, as this script always used.
        return open(filename, 'rU')
    # Python 3: the 'U' flag no longer exists (removed in 3.11); the csv
    # documentation asks for newline='' so the reader handles line endings.
    return open(filename, newline='')


def main(filename1, filename2):
    """Pair rows selected by file1 with the first later row that breaks their band.

    filename1 is a CSV of two-column rows ``ID, date string``. filename2 is a
    CSV of six-column rows whose last three columns are ``datetime, bid, ask``.

    A file2 row is selected when the first 16 characters of its datetime equal
    a date string from file1 (minute precision, assuming timestamps shaped like
    ``YYYY-MM-DD HH:MM:SS`` -- TODO confirm against real data). Each file1 entry
    selects at most one row: the first one that matches. For a selected row
    the band is ``bid - DELTA_LO .. bid + DELTA_HI``; the row is then paired
    with the earliest subsequent row whose bid lies strictly outside that band.

    Blank CSV lines are ignored in both files.

    Returns a list of 2-tuples ``(first, breaker)``:
      * ``first`` is ``[datetime, ID] + selected_row``
      * ``breaker`` is the raw file2 row whose bid left the band.
    Selected rows whose band is never broken are not reported.

    Raises ValueError if a non-blank row has the wrong number of columns or a
    bid is not a valid float.
    """
    # Mapping of ID to date string (a repeated ID keeps its last date string).
    with _open_csv(filename1) as source:
        id_to_dt = dict(
            row for row in csv.reader(source, delimiter=',') if row
        )
    # Invert the mapping, date string to ID. Having date as key makes the
    # per-row lookup below a plain dict membership test.
    source_dts = dict((dt, id_) for id_, dt in id_to_dt.items())

    # Selected rows still waiting for a band break: (high, low, rowdata).
    pending = []
    # Final results: (rowdata of selected row, row that broke its band).
    deltas = []

    with _open_csv(filename2) as ticks:
        for row in csv.reader(ticks, delimiter=','):
            if not row:
                continue
            # Three-way unpack doubles as a column-count check; ask is unused.
            dt, bid, _ask = row[3:]
            bid = float(bid)

            # Resolve every pending row whose band this bid has left. The bid
            # itself is compared with the stored bounds; comparing this row's
            # own shifted bounds would cancel the offsets and fire on any
            # change of bid at all.
            still_pending = []
            for p_hi, p_lo, p_row in pending:
                if bid > p_hi or bid < p_lo:
                    deltas.append((p_row, row))
                else:
                    still_pending.append((p_hi, p_lo, p_row))
            pending = still_pending

            # Is this row the first one in a minute requested by file1?
            key = dt[:16]
            if key in source_dts:
                # pop() removes the entry so the same minute is not selected
                # again by a later row.
                first = [dt, source_dts.pop(key)] + row
                pending.append((bid + DELTA_HI, bid - DELTA_LO, first))

    # Each tuple: first is row data for the datetime specified in file1,
    # second is the earliest later row whose bid is outside the band.
    return deltas
if __name__ == "__main__":
    import sys

    # Two CSV paths are required; any further arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: %s FILE1 FILE2" % sys.argv[0])
    f1, f2 = sys.argv[1:3]
    results = main(f1, f2)
    for a, b in results:
        # sys.stdout.write runs on both Python 2 and Python 3 and emits the
        # same text the old print statement did: the two row lists separated
        # by a single space, one pair per line.
        sys.stdout.write("%s %s\n" % (a, b))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment