Created
December 10, 2014 00:07
-
-
Save jonchang/72c993dbf6063349ef92 to your computer and use it in GitHub Desktop.
Select only certain columns from a tab-delimited file (optionally gzipped).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import gzip | |
import argparse | |
from operator import itemgetter | |
def get_args():
    """Parse the command line and return the resulting namespace.

    Three positionals are declared: ``input`` (source file), ``output``
    (destination file) and ``columns`` (one or more column names to keep).

    Returns:
        The ``argparse.Namespace`` produced from the process arguments.
    """
    cli = argparse.ArgumentParser(
        description="selects fields from a tab-delimited file",
    )
    # The two single-valued positionals share the same shape, so declare
    # them from a small table; order matters for positional parsing.
    single_valued = (
        ("input", "filename of a tab-delimited input (.gz OK)"),
        ("output", "filename of the output"),
    )
    for name, text in single_valued:
        cli.add_argument(name, help=text)
    # nargs="+" collects every remaining argument (at least one) in a list.
    cli.add_argument("columns", nargs="+", help="names of columns to keep")
    namespace = cli.parse_args()
    return namespace
def reader(fn, cols):
    """Yield the selected columns of every line of a tab-delimited file.

    The first line is treated as the header: it decides which field
    positions are kept, and it is itself yielded (reduced to the selected
    columns) like every following line.

    Args:
        fn: Path of the input file. A name ending in ".gz" is read through
            gzip; anything else is opened as a plain text file.
        cols: Header names of the columns to keep. Output fields follow the
            order given here; names that are absent from the header are
            skipped.

    Yields:
        A tuple of field strings for each input line. It is always a tuple,
        even when a single column is selected, so callers can unpack it
        uniformly.

    Raises:
        ValueError: If none of the requested names occur in the header.
        IndexError: If a line has fewer fields than a selected column needs.
    """
    # Choose the opener up front. Importing gzip.open under the name "open"
    # inside the branch would make "open" a function-local name that is
    # unbound whenever the input is not gzipped.
    opener = gzip.open if fn.endswith(".gz") else open
    wanted = None
    # "rt" makes gzip hand back text lines instead of bytes.
    with opener(fn, "rt") as rfile:
        for line in rfile:
            # Remove only the line terminator: a bare strip() would also eat
            # tabs and so drop empty leading/trailing fields.
            fields = line.rstrip("\r\n").split("\t")
            if wanted is None:
                # Header line: resolve the requested names to positions once.
                wanted = [fields.index(name) for name in cols if name in fields]
                if not wanted:
                    raise ValueError(
                        "none of the requested columns {!r} are in the "
                        "header of {!r}".format(list(cols), fn)
                    )
            yield tuple(fields[pos] for pos in wanted)
def main():
    """Copy the requested columns from the input file to the output file.

    Reads the command line via ``get_args``, streams rows from ``reader``
    and writes each one to the output path as a tab-separated line.
    """
    options = get_args()
    # reader() is lazy, so nothing is read until the loop below starts.
    rows = reader(options.input, options.columns)
    with open(options.output, "w") as sink:
        for fields in rows:
            print(*fields, sep="\t", file=sink)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment