Created
August 29, 2014 18:50
-
-
Save erikdubbelboer/f20ef392607203fb591e to your computer and use it in GitHub Desktop.
Sort a CSV file in Python using limited memory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import heapq
import itertools
import json
import os
import sys
import tempfile
def _open_spill_file():
    """Return an anonymous read/write temp file in the mode csv expects."""
    if sys.version_info[0] >= 3:
        # Python 3's csv module works on text and asks for newline='' so that
        # it controls the line endings itself.
        return tempfile.TemporaryFile(mode='w+', newline='')
    # Python 2's csv module works on byte strings and asks for binary mode.
    return tempfile.TemporaryFile(mode='w+b')


def sortCSV(infile, column, reverse, cast, chunk_size=100000):
    """Yield the rows of a CSV file sorted on one column (external merge sort).

    The input is read in chunks of at most ``chunk_size`` rows. Each chunk is
    sorted in memory and written to its own temporary file; the sorted chunks
    are then merged lazily with ``heapq.merge``. Only one chunk is sorted in
    memory at a time, but one temporary file per chunk stays open until the
    merge is finished.

    Rows whose keys compare equal are yielded in their input order.

    This is a generator: nothing is read (and no error is raised) until the
    first row is requested. The temporary files are closed when the generator
    is exhausted, closed, or fails.

    Args:
        infile: Path of the comma-delimited file to sort.
        column: Zero-based index of the column to sort on.
        reverse: If true, sort in descending order; otherwise ascending.
        cast: Callable applied to the column's string value to build the sort
            key (for example ``int``). It is called once while sorting a chunk
            and once more while merging.
        chunk_size: Maximum number of rows sorted in memory at once.

    Yields:
        Each row as a list of strings, in sorted order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        IndexError: If a row has no field at index ``column``.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    def sort_key(row):
        return cast(row[column])

    class _Entry(object):
        """A row waiting in the merge heap, ordered by key then chunk index."""

        __slots__ = ('key', 'chunk', 'row')

        def __init__(self, key, chunk, row):
            self.key = key
            self.chunk = chunk
            self.row = row

        def __lt__(self, other):
            if self.key == other.key:
                # Chunks are numbered in input order and each chunk is sorted
                # stably, so preferring the earlier chunk keeps ties stable.
                return self.chunk < other.chunk
            if reverse:
                return self.key > other.key
            return self.key < other.key

    def entries(chunk_index, spill_file):
        for row in csv.reader(spill_file, delimiter=','):
            yield _Entry(sort_key(row), chunk_index, row)

    spill_files = []
    try:
        # Phase 1: split the input into sorted chunks on disk.
        with open(infile, 'r') as f:
            reader = csv.reader(f, delimiter=',')
            while True:
                rows = list(itertools.islice(reader, chunk_size))
                if not rows:
                    break
                rows.sort(key=sort_key, reverse=reverse)
                spill = _open_spill_file()
                # Register the file before writing so it is closed even if
                # writing fails part-way.
                spill_files.append(spill)
                csv.writer(spill, delimiter=',').writerows(rows)
                spill.flush()
                spill.seek(0)

        # Phase 2: k-way merge of the sorted chunks.
        streams = [entries(i, spill) for i, spill in enumerate(spill_files)]
        for entry in heapq.merge(*streams):
            yield entry.row
    finally:
        for spill in spill_files:
            spill.close()
if __name__ == '__main__':
    # Demo: sort test.csv on column 1, parsed as int, in descending order.
    s = sortCSV('test.csv', 1, True, int)
    for row in s:
        # The parenthesised form prints the row under both Python 2 and 3;
        # the bare `print row` statement is a syntax error on Python 3.
        print(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment