endrebak/nrwas_analysis.py

## nrwas_analysis.py
def table(region, gwas_minus_region, total_gwas):

    pos_cols = "Chromosome Start End".split()
    total_region = len(region.drop_duplicates(pos_cols))

    tp = region.groupby("Trait").size()
    fp = (tp - total_region).abs()

    fn = gwas_minus_region.groupby("Trait").size()
    tn = ((fn + tp + fp) - total_gwas).abs()

    df = pd.concat([tp, fn, fp, tn], axis=1)
    df.columns = "TP FP TN FN".split()

    return df


def analyse_regions(region, gwas, name=None):

    # print("Finding the intersection between the GWAS and regions.", file=sys.stderr)

    region = gwas.overlap(region)

    if not len(region):
        print("No regions overlapping the GWAS catalog for {}.".format(name), file=sys.stderr)
        return pd.DataFrame()

    # need to remove those traits mapping multiple places to same genomic position
    drop_duplicates = lambda df: df.drop_duplicates("Chromosome Start End Trait".split())

    region = region.apply(drop_duplicates)

    gwas_minus_region = gwas.subtract(region).apply(drop_duplicates)

    # print("Finding the counts of SNPs in the regions for each of the {} terms.".format(len(region)), file=sys.stderr)
    df = table(region.df, gwas_minus_region.df, len(gwas.apply(drop_duplicates)))

    from pyranges.statistics import fisher_exact, fdr

    fe = fisher_exact(df.TP, df.FP, df.TN, df.FN, pseudocount=0.01)

    df.insert(0, "OR", fe.OR.values)
    df.insert(0, "P", fe.P.values)
    df.insert(0, "FDR", fdr(fe.P.values))

    df = df.sort_values("P")

    if not (name is None):
        df.insert(df.shape[1], "Name", name)

    return df
	def table(region, gwas_minus_region, total_gwas):

	pos_cols = "Chromosome Start End".split()
	total_region = len(region.drop_duplicates(pos_cols))

	tp = region.groupby("Trait").size()
	fp = (tp - total_region).abs()

	fn = gwas_minus_region.groupby("Trait").size()
	tn = ((fn + tp + fp) - total_gwas).abs()

	df = pd.concat([tp, fn, fp, tn], axis=1)
	df.columns = "TP FP TN FN".split()

	return df


	def analyse_regions(region, gwas, name=None):

	# print("Finding the intersection between the GWAS and regions.", file=sys.stderr)

	region = gwas.overlap(region)

	if not len(region):
	print("No regions overlapping the GWAS catalog for {}.".format(name), file=sys.stderr)
	return pd.DataFrame()

	# need to remove those traits mapping multiple places to same genomic position
	drop_duplicates = lambda df: df.drop_duplicates("Chromosome Start End Trait".split())

	region = region.apply(drop_duplicates)

	gwas_minus_region = gwas.subtract(region).apply(drop_duplicates)

	# print("Finding the counts of SNPs in the regions for each of the {} terms.".format(len(region)), file=sys.stderr)
	df = table(region.df, gwas_minus_region.df, len(gwas.apply(drop_duplicates)))

	from pyranges.statistics import fisher_exact, fdr

	fe = fisher_exact(df.TP, df.FP, df.TN, df.FN, pseudocount=0.01)

	df.insert(0, "OR", fe.OR.values)
	df.insert(0, "P", fe.P.values)
	df.insert(0, "FDR", fdr(fe.P.values))

	df = df.sort_values("P")

	if not (name is None):
	df.insert(df.shape[1], "Name", name)

	return df