Skip to content

Instantly share code, notes, and snippets.

@inodb
Created January 17, 2017 19:40
Show Gist options
  • Save inodb/c1bbb05fb8e245e9eb60fe6f9e450937 to your computer and use it in GitHub Desktop.
Save inodb/c1bbb05fb8e245e9eb60fe6f9e450937 to your computer and use it in GitHub Desktop.
"""
Output rows from table1 that are missing from table2 i.e. table1 - table2, use
given columns to determine 'missing'
"""
import argparse
import pandas as pd
import sys
def output_missing_rows(t1, t2, join_columns, delimiter, output_only_columns):
df1 = pd.read_csv(t1, sep=delimiter)
df2 = pd.read_csv(t2, sep=delimiter)
df1i = df1.set_index(join_columns)
df2i = df2.set_index(join_columns)
if output_only_columns is None:
df1i.ix[pd.Index(set(df1i.index) - set(df2i.index)),
:].to_csv(sys.stdout, sep=delimiter)
else:
df1i.ix[pd.Index(set(df1i.index) - set(df2i.index)),
output_only_columns].to_csv(sys.stdout, sep=delimiter)
def main():
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("table1", type=str, help="TSV")
parser.add_argument("table2", type=str, help="Excel output")
parser.add_argument("column_names", type=str, nargs='+', help="Which columns to join on")
parser.add_argument("-c", "--output_only_columns", type=str, default=None, nargs='+', help="Which columns to output")
parser.add_argument("-d", "--delimiter", type=str, default="\t", help="Set delimiter")
args = parser.parse_args()
output_missing_rows(args.table1, args.table2, args.column_names, args.delimiter, args.output_only_columns)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment