# nilesh-tawari/compare_excel.py

## compare_excel.py
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 09:40:36 2018

@author: rameshtn
"""

from __future__ import print_function
import os
import argparse
import pandas as pd
from pandas.util.testing import assert_frame_equal

# Command-line interface: two positional workbook paths, old first.
parser = argparse.ArgumentParser(
    description='Compare the old and new excel files generated by CET',
)
parser.add_argument('old', help='Input call old excel filename')
parser.add_argument('new', help='Input call new excel filename')

args = parser.parse_args()
# Normalise both paths to absolute form before they are handed to pandas.
old, new = os.path.abspath(args.old), os.path.abspath(args.new)

# functions
def assertFrameEqual(df1, df2):
    """Assert that two dataframes are equal regardless of column order.

    Both frames have their columns sorted by label before being handed to
    ``assert_frame_equal``; rows are not reordered, so row order still
    matters.

    Returns whatever ``assert_frame_equal`` returns; a mismatch surfaces as
    the exception raised by ``assert_frame_equal``.
    """
    left = df1.sort_index(axis=1)
    right = df2.sort_index(axis=1)
    return assert_frame_equal(left, right, check_names=True)

def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order of ``first`` is preserved and repeated items are kept.
    ``second`` is converted to a set, so its items must be hashable.
    """
    excluded = set(second)
    remaining = []
    for entry in first:
        if entry not in excluded:
            remaining.append(entry)
    return remaining


# NOTE: every check below raises AssertionError explicitly instead of using
# a bare ``assert`` so that the comparison still runs under ``python -O``.
# ``sheet_name`` is used because the older ``sheetname`` keyword is no
# longer accepted by current pandas releases.

# phenotypes should be exactly same
df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
assertFrameEqual(df_new_ph, df_old_ph)

# gene coverage should be exactly same
df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
assertFrameEqual(df_new_ge, df_old_ge)

# variants should be exactly same both in variants and transcripts
df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
# Compare new against old: comparing a column with itself always passes.
if not df_new_va['variant_ID'].equals(df_old_va['variant_ID']):
    raise AssertionError(
        "variant_ID differs between old and new 'Filtered_variants' sheets")

df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
if not df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']):
    raise AssertionError(
        "variant_ID differs between old and new 'Transcript_details' sheets")

# check columns
new_col = list(df_new_va.columns)
old_col = list(df_old_va.columns)

# there should be no new column
new_cols = diff(new_col, old_col)
if new_cols:
    raise AssertionError(
        "Unexpected new columns in 'Filtered_variants': {!r}".format(new_cols))
missing_cols = diff(old_col, new_col)

# Columns that are allowed to be absent from the new workbook; any other
# column that disappeared is treated as an error.
mis_cols = ['GM12878_confidence_value',
 'GM12878_fitCons_score',
 'GenoCanyon_score',
 'H1-hESC_confidence_value',
 'H1-hESC_fitCons_score',
 'HUVEC_confidence_value',
 'HUVEC_fitCons_score',
 'VEST3_score',
 'fathmm-MKL_coding_group',
 'fathmm-MKL_coding_pred',
 'fathmm-MKL_coding_score',
 'integrated_confidence_value',
 'integrated_fitCons_score',
 'phastCons20way_mammalian',
 'phyloP20way_mammalian']

unexpected_missing = diff(missing_cols, mis_cols)
if unexpected_missing:
    raise AssertionError(
        "Columns missing from new 'Filtered_variants' that are not in the "
        "allowed list: {!r}".format(unexpected_missing))

# samples should be same; a sample name is the text before the first ':'
# in every column label that contains one. Checked in both directions.
new_samples = {col.split(':')[0] for col in new_col if ":" in col}
old_samples = {col.split(':')[0] for col in old_col if ":" in col}
if new_samples != old_samples:
    raise AssertionError(
        "Sample sets differ; only in new: {!r}, only in old: {!r}".format(
            new_samples - old_samples, old_samples - new_samples))

print("Passed equality check !!!")
	# -- coding: utf-8 --
	"""
	Created on Tue Mar 13 09:40:36 2018

	@author: rameshtn
	"""

	from __future__ import print_function
	import os
	import argparse
	import pandas as pd
	from pandas.util.testing import assert_frame_equal

	# Command-line interface: two positional workbook paths, old first.
	parser = argparse.ArgumentParser(
		description='Compare the old and new excel files generated by CET',
	)
	parser.add_argument('old', help='Input call old excel filename')
	parser.add_argument('new', help='Input call new excel filename')

	args = parser.parse_args()
	# Normalise both paths to absolute form before they are handed to pandas.
	old, new = os.path.abspath(args.old), os.path.abspath(args.new)

	# functions
	def assertFrameEqual(df1, df2):
		"""Assert that two dataframes are equal regardless of column order.

		Both frames have their columns sorted by label before being handed
		to ``assert_frame_equal``; rows are not reordered, so row order
		still matters.

		Returns whatever ``assert_frame_equal`` returns; a mismatch surfaces
		as the exception raised by ``assert_frame_equal``.
		"""
		left = df1.sort_index(axis=1)
		right = df2.sort_index(axis=1)
		return assert_frame_equal(left, right, check_names=True)

	def diff(first, second):
		"""Return the items of ``first`` that do not occur in ``second``.

		The order of ``first`` is preserved and repeated items are kept.
		``second`` is converted to a set, so its items must be hashable.
		"""
		second = set(second)
		return [item for item in first if item not in second]


	# NOTE: every check below raises AssertionError explicitly instead of
	# using a bare ``assert`` so that the comparison still runs under
	# ``python -O``. ``sheet_name`` is used because the older ``sheetname``
	# keyword is no longer accepted by current pandas releases.

	# phenotypes should be exactly same
	df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
	df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
	assertFrameEqual(df_new_ph, df_old_ph)

	# gene coverage should be exactly same
	df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
	df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
	assertFrameEqual(df_new_ge, df_old_ge)

	# variants should be exactly same both in variants and transcripts
	df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
	df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
	# Compare new against old: comparing a column with itself always passes.
	if not df_new_va['variant_ID'].equals(df_old_va['variant_ID']):
		raise AssertionError(
			"variant_ID differs between old and new 'Filtered_variants' sheets")

	df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
	df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
	if not df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']):
		raise AssertionError(
			"variant_ID differs between old and new 'Transcript_details' sheets")

	# check columns
	new_col = list(df_new_va.columns)
	old_col = list(df_old_va.columns)

	# there should be no new column
	new_cols = diff(new_col, old_col)
	if new_cols:
		raise AssertionError(
			"Unexpected new columns in 'Filtered_variants': {!r}".format(new_cols))
	missing_cols = diff(old_col, new_col)

	# Columns that are allowed to be absent from the new workbook; any other
	# column that disappeared is treated as an error.
	mis_cols = ['GM12878_confidence_value',
		'GM12878_fitCons_score',
		'GenoCanyon_score',
		'H1-hESC_confidence_value',
		'H1-hESC_fitCons_score',
		'HUVEC_confidence_value',
		'HUVEC_fitCons_score',
		'VEST3_score',
		'fathmm-MKL_coding_group',
		'fathmm-MKL_coding_pred',
		'fathmm-MKL_coding_score',
		'integrated_confidence_value',
		'integrated_fitCons_score',
		'phastCons20way_mammalian',
		'phyloP20way_mammalian']

	unexpected_missing = diff(missing_cols, mis_cols)
	if unexpected_missing:
		raise AssertionError(
			"Columns missing from new 'Filtered_variants' that are not in the "
			"allowed list: {!r}".format(unexpected_missing))

	# samples should be same; a sample name is the text before the first ':'
	# in every column label that contains one. Checked in both directions.
	new_samples = {col.split(':')[0] for col in new_col if ":" in col}
	old_samples = {col.split(':')[0] for col in old_col if ":" in col}
	if new_samples != old_samples:
		raise AssertionError(
			"Sample sets differ; only in new: {!r}, only in old: {!r}".format(
				new_samples - old_samples, old_samples - new_samples))

	print("Passed equality check !!!")