andersgs/dummy_data.csv

## dummy_data.csv

          
GROUP,LOCUS1,LOCUS2,LOCUS3
A,1,1,2
A,2,1,2
B,1,3,2
B,2,4,2
B,2,4,1

## unique_alleles.py
'''
Find alleles that are unique to a group
'''

import click
import pandas as pd

def load_data(filename):
    '''
    Read a delimited text file into a pandas.DataFrame.

    ``sep=None`` asks pandas to detect the delimiter on its own; that
    detection is only offered by the pure-Python parser, which is why
    ``engine="python"`` is requested alongside it.
    '''
    reader_options = {"sep": None, "engine": "python"}
    frame = pd.read_csv(filename, **reader_options)
    return frame

def find_unique(tab, in_group="A", out_group="B"):
    '''
    Flag the alleles that are fixed in one group and absent from another.

    1. One-hot encode every column whose name starts with "LOCUS", giving
       one indicator column per locus/allele pair (e.g. "LOCUS2_1").
    2. Group By the "GROUP" variable.
    3. Find the proportion of individuals in each group that carry each
       allele.
    4. Mark an allele as unique when that proportion is exactly 1.0 in
       ``in_group`` and exactly 0.0 in ``out_group``.
    5. Return a Boolean vector.

    Parameters
    ----------
    tab : pandas.DataFrame
        One row per individual, with a "GROUP" column and one or more
        "LOCUS*" columns.
    in_group : hashable, default "A"
        Label of the group of interest.
    out_group : hashable, default "B"
        Label of the group to compare against. Only this group is checked;
        any further groups present in the data are ignored.

    Returns
    -------
    pandas.Series
        Boolean values indexed by the columns of the one-hot encoded table
        (minus "GROUP"); True where the allele is unique to ``in_group``.

    Raises
    ------
    KeyError
        If ``in_group`` or ``out_group`` does not occur in the "GROUP"
        column.
    '''
    locus_columns = list(tab.columns[tab.columns.str.startswith("LOCUS")])
    tab_one = pd.get_dummies(tab, columns=locus_columns)

    # The mean of a 0/1 indicator column is the proportion of carriers.
    proportions = tab_one.groupby("GROUP").mean()

    missing = [g for g in (in_group, out_group) if g not in proportions.index]
    if missing:
        raise KeyError(f"group(s) not found in the GROUP column: {missing}")

    # Label-based lookup rather than attribute access: a group called e.g.
    # "T" or "size" would otherwise resolve to a Series attribute.
    fixed_in = proportions.loc[in_group] == 1.0
    absent_out = proportions.loc[out_group] == 0.0

    unique = fixed_in & absent_out
    unique.name = None
    return unique


@click.command()
@click.argument("input_file")
def run_finder(input_file):
    # Command-line entry point: read the table at INPUT_FILE and print the
    # Boolean vector of group-unique alleles.
    # Kept as comments rather than a docstring, because click would surface
    # a docstring as the command's --help text.
    alleles = load_data(input_file)
    print(find_unique(alleles))

# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_finder()
	'''
	Find alleles that are unique to a group
	'''

	import click
	import pandas as pd

	def load_data(filename):
		'''
		Read a delimited text file into a pandas.DataFrame.

		``sep=None`` asks pandas to detect the delimiter on its own; that
		detection is only offered by the pure-Python parser, which is why
		``engine="python"`` is requested alongside it.
		'''
		return pd.read_csv(filename, engine="python", sep=None)

	def find_unique(tab, in_group="A", out_group="B"):
		'''
		Flag the alleles that are fixed in one group and absent from another.

		1. One-hot encode every column whose name starts with "LOCUS", giving
		   one indicator column per locus/allele pair (e.g. "LOCUS2_1").
		2. Group By the "GROUP" variable.
		3. Find the proportion of individuals in each group that carry each
		   allele.
		4. Mark an allele as unique when that proportion is exactly 1.0 in
		   ``in_group`` and exactly 0.0 in ``out_group``.
		5. Return a Boolean vector.

		Parameters
		----------
		tab : pandas.DataFrame
			One row per individual, with a "GROUP" column and one or more
			"LOCUS*" columns.
		in_group : hashable, default "A"
			Label of the group of interest.
		out_group : hashable, default "B"
			Label of the group to compare against. Only this group is
			checked; any further groups present in the data are ignored.

		Returns
		-------
		pandas.Series
			Boolean values indexed by the columns of the one-hot encoded
			table (minus "GROUP"); True where the allele is unique to
			``in_group``.

		Raises
		------
		KeyError
			If ``in_group`` or ``out_group`` does not occur in the "GROUP"
			column.
		'''
		locus_columns = list(tab.columns[tab.columns.str.startswith("LOCUS")])
		tab_one = pd.get_dummies(tab, columns=locus_columns)

		# The mean of a 0/1 indicator column is the proportion of carriers.
		proportions = tab_one.groupby("GROUP").mean()

		missing = [g for g in (in_group, out_group) if g not in proportions.index]
		if missing:
			raise KeyError(f"group(s) not found in the GROUP column: {missing}")

		# Label-based lookup rather than attribute access: a group called
		# e.g. "T" or "size" would otherwise resolve to a Series attribute.
		fixed_in = proportions.loc[in_group] == 1.0
		absent_out = proportions.loc[out_group] == 0.0

		unique = fixed_in & absent_out
		unique.name = None
		return unique


	@click.command()
	@click.argument("input_file")
	def run_finder(input_file):
		# Command-line entry point: read the table at INPUT_FILE and print the
		# Boolean vector of group-unique alleles.
		# Kept as comments rather than a docstring, because click would
		# surface a docstring as the command's --help text.
		tab = load_data(input_file)
		unq = find_unique(tab)
		print(unq)

	# Invoke the click command only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
		run_finder()