Skip to content

Instantly share code, notes, and snippets.

@charmoniumQ
Created April 28, 2017 05:15
Show Gist options
  • Save charmoniumQ/e0e907912be37ef4103b5957b2ecf95c to your computer and use it in GitHub Desktop.
Save charmoniumQ/e0e907912be37ef4103b5957b2ecf95c to your computer and use it in GitHub Desktop.
CSV-aware cut -- outputs chosen columns of its input
#!/usr/bin/env python3
import click
import sys
import csv
# Release number; substituted into the banner template below.
version = 0.1

# Banner template for --version. The `{version}` placeholder is filled in by
# print_version(), which also strips the surrounding newlines.
version_info = (
    '\n'
    'csvcut v{version}\n'
    'by Sam Grayson\n'
    '(backwards-c) 2017 GPLv3\n'
)
@click.command()
@click.option('--fields', '-f', type=str,
              help='select only these (numbered) fields in this order')
@click.option('--kfields', '-k', type=str,
              help='select only these keyword fields in this order')
# NOTE(review): declared with type=bool and no is_flag/flag_value, so click
# presumably expects an explicit value (`--complement true`) -- confirm before
# turning it into a bare flag, since that would change the command line.
@click.option('--complement', default=False, type=bool,
              help='inverts the selection')
# --help implicit option
@click.option('--version', 'version', flag_value=True, default=False, type=bool,
              help='prints version and exits')
@click.argument('file', default=sys.stdin, type=click.File('r'))
def main(fields, kfields, complement, version, file):
    '''CSV-aware cut -- outputs chosen columns of its input

    This script is a thin wrapper around Python's CSV (Comma-Separated Values)
    library with a cut-like interface. It is capable of handling escaped commas
    and newlines.

    With no FILE, or when FILE is -, read standard input.

    If neither keyword fields nor numbered fields are supplied, print the
    header.

    For keyword fields with -k, the first row is interpreted as the header.

    For numbered fields with -f, this accepts comma separated field-ranges of
    the following form where N, and M are integers (like the cut syntax).
    Fields are counted from 0:

    \b
    N     N'th field
    N-    from N'th field
    N-M   from N'th to M'th (included) field
    -M    from first to M'th (included) field

    Fields out of the range of the row will extract the empty string. Use / in
    place of a minus-sign to input negative number. Negative indices are
    interpreted as counting from the end of the line.
    '''
    # NOTE: the docstring above doubles as the --help text rendered by click;
    # the lone \b line keeps the table that follows it from being re-wrapped.
    if version:
        return print_version()
    if fields and kfields:
        raise RuntimeError('Cannot provide both numbered fields and keyword fields')

    # Dispatch on which selector was given; each helper returns a status code
    # (0 on success, non-zero on failure) or None.
    if fields:
        status = print_fields(file, fields, complement)
    elif kfields:
        status = print_kfields(file, kfields, complement)
    else:
        status = print_header(file)

    # click ignores a command's return value when run as a script, so a
    # failing status has to be turned into the process exit code explicitly.
    if status:
        click.get_current_context().exit(status)
    return status
def str_to_indices(fields):
    '''Parse a comma separated field specification into index selectors.

    Each comma separated item becomes one entry of the returned list:

    * ``N``   -> the int ``N``
    * ``N-M`` -> the tuple ``(N, M)``
    * ``N-``  -> the tuple ``(N, None)`` (open-ended range)
    * ``-M``  -> the tuple ``(0, M)``

    A ``/`` inside a number stands for a minus sign, because ``-`` is already
    taken as the range separator (``/1`` is -1, ``/3-/1`` is ``(-3, -1)``).

    Raises RuntimeError when an item is empty, is a bare ``-``, contains more
    than one ``-`` or has a part that is not an integer.
    '''
    message = 'Cannot parse {field}\nShould be M, M-, -M, or N-M'
    slices = []
    for field in fields.split(','):
        parts = [part.replace('/', '-') for part in field.split('-')]
        # Reject "1-2-3" (too many parts) as well as "" and "-" (nothing to
        # parse) instead of silently dropping them.
        if len(parts) > 2 or not any(parts):
            raise RuntimeError(message.format(field=field))
        try:
            numbers = [int(part) if part else None for part in parts]
        except ValueError as err:
            # Report bad numbers through the same exception type as bad
            # syntax, so callers need a single except clause.
            raise RuntimeError(message.format(field=field)) from err
        if len(numbers) == 1:
            slices.append(numbers[0])
        else:
            start, stop = numbers
            # A missing start means "from the first field"; a missing stop
            # stays None and is resolved against each row's length later.
            slices.append((0 if start is None else start, stop))
    return slices
def print_fields(file, fields, complement):
    '''Write the numbered fields selected by `fields` from `file` to stdout.

    `fields` is the raw -f specification string; it is parsed with
    str_to_indices(). Returns 1 when the specification cannot be parsed,
    otherwise the status returned by print_fields_().
    '''
    try:
        indices = str_to_indices(fields)
    except RuntimeError as e:
        # Diagnostics go to stderr so they never get mixed into the CSV
        # written on stdout.
        print(str(e), file=sys.stderr)
        return 1
    return print_fields_(file, sys.stdout, indices, complement)
def print_fields_(infile, outfile, indices, complement):
    '''Copy the selected columns of every CSV row from `infile` to `outfile`.

    `indices` is a list of selectors as produced by str_to_indices(). They are
    resolved against the length of each row separately, so negative indices
    and open-ended ranges follow rows of differing width. When `complement` is
    true, the columns that were NOT selected are written instead. Positions
    outside a row are written as the empty string. Returns 0.
    '''
    csv_in = csv.reader(infile)
    csv_out = csv.writer(outfile)
    for line in csv_in:
        width = len(line)
        # Keep the caller's selectors untouched: resolving them in place would
        # make every later row reuse positions computed for the first row.
        selected = indices_to_positive_indices(indices, width)
        if complement:
            # The `complement` parameter shadows the module-level helper of
            # the same name, so the inversion is spelled out here.
            excluded = set(selected)
            selected = [position for position in range(width)
                        if position not in excluded]
        csv_out.writerow(project_arr(line, selected, ''))
    return 0
def print_kfields(file, kfields, complement):
    '''Write the keyword fields named in `kfields` from `file` to stdout.

    `kfields` is the raw -k value: a comma separated list of header names.
    Returns the status of print_kfields_().
    '''
    return print_kfields_(file, sys.stdout, kfields.split(','), complement)
def print_kfields_(infile, outfile, fields, complement):
    '''Copy the columns named in `fields` of every CSV row to `outfile`.

    The first row of `infile` is consumed as the header and is not written to
    `outfile`. When `complement` is true, every header column NOT listed in
    `fields` is written instead, in header order. Names that do not occur in
    the header produce empty strings. Returns 0.
    '''
    csv_in = csv.DictReader(infile)
    if complement:
        # fieldnames is None when the input has no header row at all.
        header = csv_in.fieldnames or []
        fields = [field for field in header if field not in fields]
    # The writer must be built from the final column list, i.e. only after the
    # complement has been applied. extrasaction='ignore' drops the unselected
    # columns and restval fills in names the row does not have.
    csv_out = csv.DictWriter(outfile, fields, restval='', extrasaction='ignore')
    for line in csv_in:
        csv_out.writerow(line)
    return 0
def print_header(file):
    '''Print the header (first row) of `file`, one column name per line.

    Prints nothing when the input has no rows at all. Returns 0.
    '''
    header = csv.DictReader(file).fieldnames
    # fieldnames is None for completely empty input; joining None would raise.
    if header is not None:
        print('\n'.join(header))
    return 0
def print_version():
    '''Print the version banner with its surrounding blank lines removed.'''
    # The template's placeholders are filled from the module globals.
    banner = version_info.format(**globals())
    print(banner.strip())
def indices_to_positive_indices(indices, length):
    '''Expand selectors into a flat list of positions for a row of `length` fields.

    Plain ints are kept as single positions; `(start, stop)` tuples expand to
    every position from start to stop inclusive, with a stop of None meaning
    the last field. Negative values have `length` added to them once, so a
    value below `-length` stays negative.
    '''
    def from_end(position):
        # Negative positions count from the end of the row.
        return position + length if position < 0 else position

    positions = []
    for selector in indices:
        if not isinstance(selector, tuple):
            positions.append(from_end(selector))
            continue
        first, last = selector
        first = from_end(first)
        last = from_end(length - 1 if last is None else last)
        positions.extend(range(first, last + 1))  # inclusive, like unix cut
    return positions
def complement(indices, length):
    '''Return the positions in range(length) that are not in `indices`, in order.'''
    # Build the lookup set once rather than once per candidate position.
    excluded = set(indices)
    return [index for index in range(length) if index not in excluded]
def project(d, keys, default=None):
    '''Return a dict holding `d[key]` for each key in `keys`.

    Keys missing from `d` map to `default`.
    '''
    return {key: d.get(key, default) for key in keys}
def project_arr(arr, indices, default=None):
    '''Return the items of `arr` at `indices`, in the order given.

    Positions outside the sequence, negative ones included, yield `default`.
    '''
    size = len(arr)
    picked = []
    for position in indices:
        if position < 0 or position >= size:
            picked.append(default)
        else:
            picked.append(arr[position])
    return picked
def sniff_dialect(file):
    '''Guess the CSV dialect of `file` from its first 1024 characters.

    The file is rewound to its start afterwards, so it has to support seek().
    '''
    sample = file.read(1024)
    guessed = csv.Sniffer().sniff(sample)
    file.seek(0)
    return guessed
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment