Created
May 2, 2015 12:14
-
-
Save pgorczak/6526b61ff58550c6b9ae to your computer and use it in GitHub Desktop.
Easy uniq for csv files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
"""Easy uniq for csv files.

Collapse every run of consecutive rows that share the same value in one
named column, keeping only the first row of each run.  The header row is
copied to the output unchanged.

Usage: ``name infile [outfile]`` (output defaults to stdout).
"""
import argparse
import csv
import itertools as itt
import sys


def build_parser():
    """Return the argument parser for ``name infile [outfile]``."""
    parser = argparse.ArgumentParser(
        description='Eliminate runs of identical values from csv data. '
                    'Matching lines are merged to the first occurrence.')
    parser.add_argument('name', type=str,
                        help='name of the column to be processed')
    parser.add_argument('infile', help='input file',
                        type=argparse.FileType('r'))
    parser.add_argument('outfile', help='output file', nargs='?',
                        type=argparse.FileType('w'), default=sys.stdout)
    return parser


def find_column(names, name):
    """Return the position of ``name`` within the header row ``names``.

    Raises:
        KeyError: if ``name`` is not one of ``names``; the message lists
            the names that are available.
    """
    try:
        return names.index(name)
    except ValueError:
        raise KeyError('Specified name "{}" not found in the file (names are: {})'.format(name, ', '.join(names)))


def drop_runs(rows, index):
    """Yield the first row of every run of rows with equal ``row[index]``.

    Only *consecutive* duplicates are merged (like the ``uniq`` tool), so
    a value that reappears after a different one is emitted again.  Rows
    are consumed lazily; a row with fewer than ``index + 1`` fields raises
    ``IndexError``.
    """
    for _value, run in itt.groupby(rows, lambda row: row[index]):
        # groupby never produces an empty group, so next() always succeeds.
        yield next(run)


def uniq_csv(infile, outfile, name):
    """Copy csv data from ``infile`` to ``outfile``, merging runs in ``name``.

    ``infile`` is anything ``csv.reader`` accepts and ``outfile`` anything
    ``csv.writer`` accepts.  The first input row is taken as the header.

    Raises:
        KeyError: if the header has no column called ``name`` (this
            includes completely empty input, which has no header at all).
    """
    reader = csv.reader(infile)
    # next(reader, []) works on Python 2 and 3 and turns empty input into
    # an empty header instead of a bare StopIteration.
    names = next(reader, [])
    index = find_column(names, name)

    writer = csv.writer(outfile)
    # names in the first row of output csv
    writer.writerow(names)
    writer.writerows(drop_runs(reader, index))


def main(argv=None):
    """Run the command-line tool; ``argv`` defaults to ``sys.argv[1:]``."""
    args = build_parser().parse_args(argv)
    # NOTE(review): on Python 3 the csv docs recommend files opened with
    # newline=''; argparse.FileType is kept here for Python 2 compatibility.
    try:
        uniq_csv(args.infile, args.outfile, args.name)
    finally:
        args.infile.close()
        # Leave the shared stdout stream open for the interpreter.
        if args.outfile is not sys.stdout:
            args.outfile.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment