johnbowes/positional_duplicates.py

## positional_duplicates.py
#!/usr/bin/python

import sys
import pandas as pd
import numpy as np

def find_positional_duplicates(lmiss, bim):
    """Return the positional-duplicate SNPs that should be excluded.

    SNPs that share the same (CHR, POSITION) pair are treated as duplicates
    of one another.  Within every such group the SNP with the lowest F_MISS
    is kept and all remaining members are reported as failures.

    NOTE ON PROCESS:
    Some groups have more than 2 rows, so using an idxmax method would
    only exclude one of the three.  Instead, the duplicate with the least
    amount of missing data, of any sized group, is identified and then
    filtered out of the duplicate dataframe.

    Parameters
    ----------
    lmiss : pandas.DataFrame
        Per-SNP missingness table; must contain CHR, SNP and F_MISS.
    bim : pandas.DataFrame
        Variant map; must contain CHR, SNP and POSITION.

    Returns
    -------
    pandas.DataFrame
        Rows of the merged table for the SNPs to exclude.  Empty when no
        two SNPs share a position.
    """
    # left join on lmiss and bim
    data = pd.merge(lmiss, bim, how='left', on=['SNP', 'CHR'])

    # get duplicates based on chr bp
    by_position = ['CHR', 'POSITION']
    duplicates = data.groupby(by_position).filter(lambda x: len(x) > 1)
    if duplicates.empty:
        # nothing shares a position, so there is nothing to exclude
        return duplicates

    # idxmin() takes no axis here: the grouped object is a Series, and
    # passing axis=1 is rejected (or warned about) by current pandas.
    keep_index = duplicates.groupby(by_position)['F_MISS'].idxmin()
    keep = duplicates.loc[keep_index]
    return duplicates[~duplicates['SNP'].isin(keep['SNP'])]


def main(argv=None):
    """Command-line entry point.

    arguments are fixed in the following format:
        positional_duplicates.py <lmiss_file> <bim_file> <output_file>

    Parameters
    ----------
    argv : list of str, optional
        Full argument vector including the program name; defaults to
        ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('usage: positional_duplicates.py '
                 '<lmiss_file> <bim_file> <output_file>')
    lmiss_file, bim_file, output_file = argv[1:4]

    # read .bim and .lmiss
    # (.bim has no header line, so the column names are supplied here)
    bim_header = ['CHR', 'SNP', 'DIST', 'POSITION', 'A1', 'A2']
    bim = pd.read_table(bim_file, header=None, names=bim_header)
    bim = bim.drop(['DIST', 'A1', 'A2'], axis=1)

    # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True
    lmiss = pd.read_table(lmiss_file, sep=r'\s+')
    lmiss = lmiss.drop(['N_MISS', 'N_GENO'], axis=1)

    fails = find_positional_duplicates(lmiss, bim)

    # write list of fails
    fails.to_csv(output_file, columns=['SNP'], index=False, header=False)


if __name__ == '__main__':
    main()
	#!/usr/bin/python

	import sys
	import pandas as pd
	import numpy as np

	def find_positional_duplicates(lmiss, bim):
	    """Return the positional-duplicate SNPs that should be excluded.

	    SNPs that share the same (CHR, POSITION) pair are treated as duplicates
	    of one another.  Within every such group the SNP with the lowest F_MISS
	    is kept and all remaining members are reported as failures.

	    NOTE ON PROCESS:
	    Some groups have more than 2 rows, so using an idxmax method would
	    only exclude one of the three.  Instead, the duplicate with the least
	    amount of missing data, of any sized group, is identified and then
	    filtered out of the duplicate dataframe.

	    Parameters
	    ----------
	    lmiss : pandas.DataFrame
	        Per-SNP missingness table; must contain CHR, SNP and F_MISS.
	    bim : pandas.DataFrame
	        Variant map; must contain CHR, SNP and POSITION.

	    Returns
	    -------
	    pandas.DataFrame
	        Rows of the merged table for the SNPs to exclude.  Empty when no
	        two SNPs share a position.
	    """
	    # left join on lmiss and bim
	    data = pd.merge(lmiss, bim, how='left', on=['SNP', 'CHR'])

	    # get duplicates based on chr bp
	    by_position = ['CHR', 'POSITION']
	    duplicates = data.groupby(by_position).filter(lambda x: len(x) > 1)
	    if duplicates.empty:
	        # nothing shares a position, so there is nothing to exclude
	        return duplicates

	    # idxmin() takes no axis here: the grouped object is a Series, and
	    # passing axis=1 is rejected (or warned about) by current pandas.
	    keep_index = duplicates.groupby(by_position)['F_MISS'].idxmin()
	    keep = duplicates.loc[keep_index]
	    return duplicates[~duplicates['SNP'].isin(keep['SNP'])]


	def main(argv=None):
	    """Command-line entry point.

	    arguments are fixed in the following format:
	        positional_duplicates.py <lmiss_file> <bim_file> <output_file>

	    Parameters
	    ----------
	    argv : list of str, optional
	        Full argument vector including the program name; defaults to
	        ``sys.argv``.
	    """
	    if argv is None:
	        argv = sys.argv
	    if len(argv) < 4:
	        sys.exit('usage: positional_duplicates.py '
	                 '<lmiss_file> <bim_file> <output_file>')
	    lmiss_file, bim_file, output_file = argv[1:4]

	    # read .bim and .lmiss
	    # (.bim has no header line, so the column names are supplied here)
	    bim_header = ['CHR', 'SNP', 'DIST', 'POSITION', 'A1', 'A2']
	    bim = pd.read_table(bim_file, header=None, names=bim_header)
	    bim = bim.drop(['DIST', 'A1', 'A2'], axis=1)

	    # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True
	    lmiss = pd.read_table(lmiss_file, sep=r'\s+')
	    lmiss = lmiss.drop(['N_MISS', 'N_GENO'], axis=1)

	    fails = find_positional_duplicates(lmiss, bim)

	    # write list of fails
	    fails.to_csv(output_file, columns=['SNP'], index=False, header=False)


	if __name__ == '__main__':
	    main()