Skip to content

Instantly share code, notes, and snippets.

@nilesh-tawari
Last active March 13, 2018 03:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nilesh-tawari/d5c15fb6f12f6b76216f3b17febf41cd to your computer and use it in GitHub Desktop.
Save nilesh-tawari/d5c15fb6f12f6b76216f3b17febf41cd to your computer and use it in GitHub Desktop.
check_excel
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 09:40:36 2018
@author: rameshtn
"""
from __future__ import print_function
import os
import argparse
import pandas as pd
from pandas.util.testing import assert_frame_equal
# Parameters
# Command-line interface: two positional workbook paths, old first, then new.
parser = argparse.ArgumentParser(
    description='Compare the old and new excel files generated by CET'
)
parser.add_argument('old', help='Input call old excel filename')
parser.add_argument('new', help='Input call new excel filename')
args = parser.parse_args()
# Normalise both user-supplied paths to absolute form.
old, new = (os.path.abspath(path) for path in (args.old, args.new))
# functions
def assertFrameEqual(df1, df2):
    """Assert that two dataframes are equal, ignoring ordering of columns.

    Both frames have their columns sorted by label before being handed to
    ``assert_frame_equal``, so only column *order* is tolerated; row order,
    values, dtypes and index/column names must still match.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The frames to compare.

    Returns
    -------
    Whatever ``assert_frame_equal`` returns (``None`` on success).

    Raises
    ------
    AssertionError
        Raised by ``assert_frame_equal`` when the frames differ.
    """
    # sort_index(axis=1) reorders columns by label and returns a new frame,
    # so neither input is modified.
    left = df1.sort_index(axis=1)
    right = df2.sort_index(axis=1)
    return assert_frame_equal(left, right, check_names=True)
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The result is a list that keeps the iteration order of ``first`` and
    any duplicates it contains. Items of ``second`` must be hashable
    because they are collected into a set for membership testing.

    Parameters
    ----------
    first : iterable
        Candidate items.
    second : iterable
        Items to exclude.

    Returns
    -------
    list
        Every item of ``first`` that is not a member of ``second``.
    """
    # Build the lookup set once so each membership test is O(1).
    excluded = set(second)
    return [item for item in first if item not in excluded]
# Compare the old and new workbooks sheet by sheet. Every failed check raises
# AssertionError explicitly (rather than via a bare ``assert``) so the checks
# still run under ``python -O`` and report what went wrong.

# phenotypes should be exactly the same
df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
assertFrameEqual(df_new_ph, df_old_ph)

# gene coverage should be exactly the same
df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
assertFrameEqual(df_new_ge, df_old_ge)

# variant IDs should be exactly the same, both in the variants sheet and in
# the transcripts sheet; each check compares the NEW column with the OLD one
df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
if not df_new_va['variant_ID'].equals(df_old_va['variant_ID']):
    raise AssertionError("variant_ID differs between old and new "
                         "'Filtered_variants' sheets")
df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
if not df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']):
    raise AssertionError("variant_ID differs between old and new "
                         "'Transcript_details' sheets")

# check columns of the variants sheet
new_col = list(df_new_va.columns)
old_col = list(df_old_va.columns)

# there should be no new column
new_cols = diff(new_col, old_col)
if new_cols:
    raise AssertionError("Unexpected new columns in 'Filtered_variants': "
                         "{}".format(new_cols))

# some columns are allowed to be missing from the new file; any other
# missing column is an error
missing_cols = diff(old_col, new_col)
mis_cols = [
    'GM12878_confidence_value',
    'GM12878_fitCons_score',
    'GenoCanyon_score',
    'H1-hESC_confidence_value',
    'H1-hESC_fitCons_score',
    'HUVEC_confidence_value',
    'HUVEC_fitCons_score',
    'VEST3_score',
    'fathmm-MKL_coding_group',
    'fathmm-MKL_coding_pred',
    'fathmm-MKL_coding_score',
    'integrated_confidence_value',
    'integrated_fitCons_score',
    'phastCons20way_mammalian',
    'phyloP20way_mammalian',
]
unexpected_missing = diff(missing_cols, mis_cols)
if unexpected_missing:
    raise AssertionError("Columns unexpectedly missing from new "
                         "'Filtered_variants': {}".format(unexpected_missing))

# samples should be the same: the sample name is taken as the text before the
# first ':' in every column label that contains one, and the two sets must
# match in BOTH directions
new_samples = {col.split(':')[0] for col in new_col if ":" in col}
old_samples = {col.split(':')[0] for col in old_col if ":" in col}
if new_samples != old_samples:
    raise AssertionError(
        "Sample sets differ; only in new: {}; only in old: {}".format(
            sorted(new_samples - old_samples),
            sorted(old_samples - new_samples)))

print("Passed equality check !!!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment