Skip to content

Instantly share code, notes, and snippets.

@kwarrick
Last active December 18, 2015 05:59
Show Gist options
  • Save kwarrick/5736376 to your computer and use it in GitHub Desktop.
Save kwarrick/5736376 to your computer and use it in GitHub Desktop.
Map-reduce a CSV file using the UNIX sort utility in just 24 lines of code.
#!/usr/bin/env python
# kwarrick@uga.edu
import csv
import subprocess
from itertools import groupby
def identity(infile, outfile):
    """Run a pass-through job: rows go from infile to outfile unmodified.

    Rows are grouped on their first column, the mapper emits each input
    row as-is, and the reducer re-emits every row of a group. The actual
    reading, sorting and writing is delegated to map_reduce_csv.
    """
    def first_column(row):
        # Grouping key: the first field of the row.
        return row[0]

    def emit_row(row):
        # One input row maps to exactly one output row.
        return iter((row,))

    def emit_group(group_key, rows):
        # The group key is not needed; hand every row of the group back.
        return (row for row in rows)

    map_reduce_csv(emit_row, emit_group, first_column, infile, outfile)
def map_reduce_csv(mapper, reducer, key, infile, outfile):
    """Map-reduce a CSV file using the UNIX sort utility.

    Each row parsed from ``infile`` is passed to ``mapper``; the rows it
    produces are written to a ``/usr/bin/sort`` child process. The sorted
    rows are read back, grouped with ``itertools.groupby`` on ``key`` and
    every group is handed to ``reducer``, whose rows are written to
    ``outfile``.

    Args:
        mapper: callable ``mapper(row)`` returning an iterable of rows.
        reducer: callable ``reducer(k, rows)`` returning an iterable of
            rows; ``rows`` is a list of all consecutive sorted rows whose
            key equals ``k``.
        key: callable ``key(row)`` returning the grouping key of a row.
        infile: iterable of CSV text lines (anything ``csv.reader`` accepts).
        outfile: writable text file object receiving the reduced rows.

    Raises:
        subprocess.CalledProcessError: if ``sort`` exits with a non-zero
            status after the job otherwise completed.
        csv.Error: if a field needs quoting/escaping; rows are written with
            ``csv.QUOTE_NONE`` and no escape character.

    Note:
        ``sort`` is invoked without ``-k``, so it compares whole lines, and
        ``groupby`` only merges adjacent rows. Rows with equal keys are
        therefore only guaranteed to land in one group when the key is
        derived from the leading column(s) of the row.
    """
    command = ['/usr/bin/sort', '-t,']
    sort = subprocess.Popen(
        command,
        # LC_ALL=C makes sort compare raw bytes instead of using the
        # locale's collation rules.
        env={'LC_ALL': 'C'},
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        # Text-mode pipes: the csv module reads and writes str, which a
        # binary pipe rejects on Python 3.
        universal_newlines=True,
    )
    try:
        # map and sort
        writer = csv.writer(sort.stdin, quoting=csv.QUOTE_NONE)
        for row in csv.reader(infile):
            writer.writerows(mapper(row))
        # sort only starts emitting output once it has seen end-of-input.
        sort.stdin.close()

        # group and reduce
        writer = csv.writer(outfile, quoting=csv.QUOTE_NONE)
        for k, v in groupby(csv.reader(sort.stdout), key):
            writer.writerows(reducer(k, list(v)))
    finally:
        # Always release both pipe ends and reap the child, even when the
        # mapper, reducer or key function raised, so no zombie process or
        # open descriptor is left behind.
        for pipe in (sort.stdin, sort.stdout):
            try:
                pipe.close()
            except (IOError, OSError):
                # Cleanup is best effort; a failure here must not mask the
                # exception that is already propagating.
                pass
        returncode = sort.wait()
    # Only reached when the job itself succeeded: surface a failed sort
    # instead of silently producing truncated output.
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, command)
if __name__ == '__main__':
    # Script entry point: run the identity job over the input lines
    # provided by fileinput.input() and write the result to output.csv
    # in the current directory.
    import fileinput

    input_lines = fileinput.input()
    with open('output.csv', 'w') as outfile:
        identity(input_lines, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment