Skip to content

Instantly share code, notes, and snippets.

@benhoyt
Created February 22, 2017 20:49
Show Gist options
  • Save benhoyt/fae1d584435a5dcff65c8330fe368a78 to your computer and use it in GitHub Desktop.
Save benhoyt/fae1d584435a5dcff65c8330fe368a78 to your computer and use it in GitHub Desktop.
Efficient sliding-window sorting of time-series data in CSV file (in Python)
"""Efficient sliding-window sorting of time-series data in CSV file.
Demo for http://stackoverflow.com/a/42398981/68707
Tested on Python 3.5.
"""
import collections
import csv
import datetime
import itertools
import random
def write_random_output(filename, num_lines):
    """Write *num_lines* rows of mostly-ordered random trade data as CSV.

    Each row is ``[timestamp, asset_price, units_traded, ticks]``.  The
    nominal timestamp starts at 2016-01-01 00:00:00 and advances by one
    second per row.  About one row in twenty is back-dated by a random
    amount of less than five seconds, so the file is *nearly* sorted by
    timestamp but contains occasional out-of-order rows.

    Note that ``asset_price`` is generated as a negative float in the range
    -1099999.9 .. -1000000.0.

    Any existing file at *filename* is overwritten.
    """
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        timestamp = datetime.datetime(2016, 1, 1, 0, 0, 0)
        one_second = datetime.timedelta(seconds=1)
        for _ in range(num_lines):
            asset_price = -(1000000 + random.randrange(1000000) / 10.0)
            units_traded = 500000 + random.randrange(100000)
            ticks = 4000 + random.randrange(1000)
            if random.randrange(20) == 0:
                # Simulate a late-arriving record: keep its position in the
                # file but stamp it up to (just under) five seconds earlier.
                late_by = datetime.timedelta(
                    microseconds=random.randrange(5000000))
                t = timestamp - late_by
            else:
                t = timestamp
            row = [t.strftime('%Y-%m-%d %H:%M:%S.%f'),
                   asset_price, units_traded, ticks]
            writer.writerow(row)
            timestamp += one_second
def process_sliding(in_filename, out_filename, window_size=20):
    """Sort a nearly-sorted CSV file using a bounded sliding window.

    Rows are streamed from *in_filename* to *out_filename* while holding at
    most *window_size* rows in memory.  The window is kept sorted; for every
    new input row the smallest buffered row is written out and the new row
    is added.  A row is only put back in order if it is still inside the
    window when its correct position comes up, so rows displaced further
    than the window can cover may still come out unordered.

    Out-of-order detection compares the first column only (``row[0]``, the
    timestamp in the demo data) as a string; re-sorting compares whole rows.

    Args:
        in_filename: path of the CSV file to read.
        out_filename: path of the CSV file to write (overwritten).
        window_size: maximum number of rows buffered at once; must be >= 1.

    Raises:
        ValueError: if *window_size* is less than 1.
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    with open(in_filename, newline='') as fin, \
            open(out_filename, 'w', newline='') as fout:
        reader = csv.reader(fin)
        writer = csv.writer(fout)

        # islice simply stops at end of input, so files with fewer rows than
        # the window are handled (calling next() inside a generator would
        # raise RuntimeError on Python 3.7+ when the reader runs dry).
        first_window = sorted(itertools.islice(reader, window_size))
        window = collections.deque(first_window, maxlen=window_size)

        for row in reader:
            writer.writerow(window.popleft())
            window.append(row)
            # The window was sorted before the append, so it only needs
            # re-sorting when the new row precedes the previous newest row.
            # The length guard covers window_size == 1, where there is no
            # previous row to compare against.
            if len(window) > 1 and row[0] < window[-2][0]:
                window = collections.deque(sorted(window), maxlen=window_size)

        # Flush whatever is still buffered; it is already in sorted order.
        writer.writerows(window)
def process_sort(in_filename, out_filename):
    """Sort a CSV file by reading every row into memory at once.

    Straightforward full sort of *in_filename* into *out_filename*
    (overwritten).  Rows are compared as lists of strings, i.e. field by
    field starting with the first column.  Memory use grows with the size
    of the input, unlike the windowed approach.
    """
    with open(in_filename, newline='') as fin, \
            open(out_filename, 'w', newline='') as fout:
        reader = csv.reader(fin)
        writer = csv.writer(fout)
        writer.writerows(sorted(reader))
if __name__ == '__main__':
    # Demo entry point: sort 'big.csv' into 'sorted.csv' with the default
    # window.  To (re)generate the 10-million-row input first, uncomment:
    # write_random_output('big.csv', 10000000)
    process_sliding('big.csv', 'sorted.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment