Skip to content

Instantly share code, notes, and snippets.

@tomhennigan
Created August 1, 2012 20:17
Show Gist options
  • Save tomhennigan/3230337 to your computer and use it in GitHub Desktop.
Save tomhennigan/3230337 to your computer and use it in GitHub Desktop.
Find always-empty columns in a .csv file
a b c e
a c e
c e
e
#!/usr/bin/env python
# Prints any always-empty columns in a CSV input file.
# usage: cat data.csv | ./mr --map | sort -k1n | ./mr --reduce
import sys
def map_main():
    """Map step: emit one ``<column index>\\t<marker>`` line per CSV field.

    Reads CSV rows from ``sys.stdin``. For every field of every row a line
    is printed to stdout holding the zero-based column index, a tab, and a
    marker: ``.`` when the field is non-empty, nothing when it is empty.
    The field's content is deliberately discarded; only its emptiness
    matters to the reduce step.
    """
    import csv

    for row in csv.reader(sys.stdin):
        for index, field in enumerate(row):
            marker = '.' if field else ''
            # A single parenthesised argument prints identically whether
            # `print` is the Python 2 statement or the Python 3 function.
            print(str(index) + "\t" + marker)
def reduce_main():
    """Reduce step: report every column whose values are all empty.

    Reads ``<key>\\t<value>`` lines from ``sys.stdin``. Lines that share a
    key must be adjacent (the documented pipeline sorts the map output
    before piping it here), because only consecutive runs of the same key
    are treated as one group. For each group whose values are all empty
    strings, ``column <key> is always empty`` is printed to stdout.

    Empty input produces no output.

    Raises:
        ValueError: if an input line contains no tab separator.
    """
    from itertools import groupby

    def mr_reduce(key, values):
        # Yield the key only when no value in the group is non-empty.
        if not any(values):
            yield (key, None)

    def records():
        # Iterate whole lines so an over-long line is never cut in two.
        for line in sys.stdin:
            key, value = line.rstrip('\n').split("\t", 1)
            yield key, value

    # groupby never produces a group for empty input, so nothing is
    # reported when stdin is empty.
    for key, group in groupby(records(), key=lambda record: record[0]):
        values = [value for _, value in group]
        for k, _ in mr_reduce(key, values):
            # Single-argument print works as both the Python 2 statement
            # and the Python 3 function.
            print("column " + str(k) + " is always empty")
# Command-line dispatch: exactly one mode flag, --map or --reduce, is required.
# An explicit check is used rather than `assert`, because assertions are
# stripped under `python -O`, which would let a bad invocation fall through.
if len(sys.argv) != 2 or sys.argv[1] not in ('--map', '--reduce'):
    # sys.exit with a string writes it to stderr and exits with status 1.
    sys.exit("call with either --map or --reduce")
if sys.argv[1] == "--map":
    map_main()
else:
    reduce_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment