Created
July 11, 2019 15:32
-
-
Save kkew3/4c83feb1099a52e9c17243a580ea3152 to your computer and use it in GitHub Desktop.
Small utility to select columns by name (assuming the first row contains the titles) from a CSV file without ambiguity (`csvcut` from `csvkit` is currently ambiguous)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import logging | |
import sys | |
def make_parser():
    """Build the command-line parser for the column selector.

    Options:
        -d/--delimiter: field delimiter string (default ``','``).
        -F/--no-print-field: stores ``False`` into ``print_field``
            (``True`` when the flag is absent).
        -S/--strict: stores ``True`` into ``strict``.
        FIELD...: zero or more column titles, collected into ``fields``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='Select column(s) of CSV file by name assuming the first '
                    'row of the CSV lists the column names. Currently the '
                    'script does not support quoting.')
    parser.add_argument('-d', '--delimiter', default=',',
                        help='field delimiter, default to comma')
    # The flag turns title printing OFF, so the help text must say so.
    parser.add_argument('-F', '--no-print-field', action='store_false',
                        dest='print_field',
                        help='do not print the selected titles')
    parser.add_argument('-S', '--strict', action='store_true',
                        help='raise error if one of FIELDs does not exist')
    parser.add_argument('fields', nargs='*', metavar='FIELD',
                        help='FIELD to select')
    return parser
def _main():
    """Select the requested columns from the CSV on stdin and print them.

    The first input line is split on the delimiter to obtain the column
    titles; each requested FIELD is looked up among them. Subsequent lines
    are split the same way and only the selected columns are printed, in
    the order the FIELDs were given. No quoting is interpreted.

    Returns:
        0 on success (including empty input), 4 when ``--strict`` is set
        and a requested field is missing, 130 on ``KeyboardInterrupt``.
        After a ``BrokenPipeError`` the function falls through and
        returns ``None``.
    """
    args = make_parser().parse_args()
    logging.basicConfig(format='%(filename)s: %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    delimiter = args.delimiter
    try:
        infile = sys.stdin
        header = next(infile, None)
        if header is None:
            # Empty input has no title row; a bare next() would escape as
            # an uncaught StopIteration, so finish quietly instead.
            return 0
        titles = header.rstrip('\n').split(delimiter)

        indices = []
        for field in args.fields:
            try:
                index = titles.index(field)
            except ValueError:
                if args.strict:
                    logging.error('Field "%s" does not exist; aborted',
                                  field)
                    return 4
                logging.warning('Field %s does not exist', field)
            else:
                indices.append(index)

        if args.print_field:
            print(delimiter.join(titles[j] for j in indices))

        if args.fields:
            for line in infile:
                tokens = line.rstrip('\n').split(delimiter)
                # Rows shorter than the title row keep whichever selected
                # columns they do have. Indices come from list.index() and
                # are never negative, so an upper-bound check suffices.
                width = len(tokens)
                filtered_line = [tokens[j] for j in indices if j < width]
                if filtered_line:
                    print(delimiter.join(filtered_line))
    except KeyboardInterrupt:
        return 130
    except BrokenPipeError:
        # NOTE(review): presumably closes stderr so the interpreter cannot
        # report the broken pipe again while flushing stdout at exit.
        sys.stderr.close()
    else:
        return 0
# Script entry point: whatever _main() returns is used as the process exit
# status.
if __name__ == '__main__':
    raise SystemExit(_main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment