Skip to content

Instantly share code, notes, and snippets.

@erikdubbelboer
Created August 29, 2014 18:50
Show Gist options
  • Save erikdubbelboer/f20ef392607203fb591e to your computer and use it in GitHub Desktop.
Save erikdubbelboer/f20ef392607203fb591e to your computer and use it in GitHub Desktop.
Sort a CSV file in Python using limited memory
import csv
import heapq
import itertools
import json
import os
import sys
import tempfile
# The csv module wants text files (newline='') on Python 3 and binary files on
# Python 2, so every file handed to it is opened according to this flag.
_PY3 = sys.version_info[0] >= 3


def _open_csv_input(path):
    """Open *path* for reading in the mode the csv module expects."""
    if _PY3:
        return open(path, 'r', newline='')
    return open(path, 'rb')


def _open_csv_tempfile():
    """Create an anonymous read/write temporary file usable by the csv module."""
    if _PY3:
        return tempfile.TemporaryFile(mode='w+', newline='')
    return tempfile.TemporaryFile(mode='w+b')


class _MergeEntry(object):
    """A row paired with its sort key; ordered and compared by key only.

    Equality is defined on the key as well, so that when two runs offer equal
    keys heapq.merge falls back to the run index and keeps the merge stable.
    """

    __slots__ = ('key', 'row')
    __hash__ = None

    def __init__(self, key, row):
        self.key = key
        self.row = row

    def __eq__(self, other):
        return self.key == other.key

    def __lt__(self, other):
        return self.key < other.key


class _ReverseMergeEntry(_MergeEntry):
    """Same as _MergeEntry, but larger keys sort first (descending merge)."""

    __slots__ = ()

    def __lt__(self, other):
        return other.key < self.key


def sortCSV(infile, column, reverse, cast, chunk_size=100000):
    """Yield the rows of a comma-separated file in sorted order.

    The file is read in chunks of at most ``chunk_size`` rows. Each chunk is
    sorted in memory and spilled to a temporary file; the sorted chunks are
    then merged lazily with ``heapq.merge``, so only one chunk of rows is held
    in memory at a time. The result has the same order as
    ``sorted(rows, key=lambda r: cast(r[column]), reverse=reverse)``.

    This is a generator: no work is done (and no error is raised) until the
    first row is requested. The temporary files are closed when the generator
    is exhausted, closed, or fails.

    Args:
        infile: Path of the CSV file to sort.
        column: Zero-based index of the column to sort on.
        reverse: If true, sort in descending order.
        cast: Callable applied to the column's string value to build the
            sort key (for example ``int`` or ``float``).
        chunk_size: Maximum number of rows sorted in memory at once.

    Yields:
        Each row as a list of strings, in sorted order.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    def sort_key(row):
        return cast(row[column])

    entry_type = _ReverseMergeEntry if reverse else _MergeEntry

    def decorate(rows):
        # Attach the key once per row so the merge does not recompute it.
        for row in rows:
            yield entry_type(sort_key(row), row)

    spill_files = []
    try:
        with _open_csv_input(infile) as source:
            reader = csv.reader(source, delimiter=',')
            while True:
                chunk = list(itertools.islice(reader, chunk_size))
                if not chunk:
                    break
                chunk.sort(key=sort_key, reverse=reverse)

                spill = _open_csv_tempfile()
                # Register before writing so the file is closed even if the
                # write fails part-way through.
                spill_files.append(spill)
                csv.writer(spill, delimiter=',').writerows(chunk)
                spill.flush()
                spill.seek(0)

        runs = [decorate(csv.reader(spill, delimiter=','))
                for spill in spill_files]
        for entry in heapq.merge(*runs):
            yield entry.row
    finally:
        for spill in spill_files:
            spill.close()
if __name__ == '__main__':
    # Demo: sort test.csv on column 1, parsed as int, in descending order.
    # print(row) with a single argument behaves the same on Python 2 and 3.
    for row in sortCSV('test.csv', 1, True, int):
        print(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment