Skip to content

Instantly share code, notes, and snippets.

@endrebak
Last active December 11, 2019 12:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save endrebak/b49a10c3696f619e000ae65aae96ee68 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import sys
import pandas as pd
import numpy as np
import pyranges as pr
import argparse
import pathlib
from nrwas.version import __version__
# Sniff sys.argv before parsing: an example/version invocation runs no
# analysis, and a sample sheet replaces --bed/--gwas_path, so in both cases
# the otherwise-required flags are relaxed below.
# Fix: "-x" is the short flag actually registered for --example, but the
# original only checked "-ex", so "-x" still demanded --gwas_path.
# NOTE(review): no --version/-v argument is registered on the parser, so
# argparse will reject those flags even though they are sniffed here.
example_run = ("--version" in sys.argv or "-v" in sys.argv
               or "-x" in sys.argv or "-ex" in sys.argv or "--example" in sys.argv)
using_sample_sheet = ("--sample-sheet" in sys.argv or "-ss" in sys.argv)

parser = argparse.ArgumentParser(
    description="""nrwas version: {}""".format(__version__),
    prog=os.path.basename(__file__))

parser.add_argument(
    '--gwas_path',
    '-g',
    # Required for a real run; optional for example runs or sample sheets.
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Path where the extended gwas catalog file is stored. If it does not exist it will be downloaded,
and this is the path where it will be stored. Missing paths are created.''')

parser.add_argument(
    '--bed',
    '-b',
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Bed file of regions to investigate for SNP enrichment.'''
)

parser.add_argument(
    '--sample-sheet',
    '-ss',
    required=False,
    type=str,
    help='''Tab-separated file with two columns: File and Name, where the files are the bed files to analyse and the names are descriptions used in the output files.'''
)

parser.add_argument(
    '--false-discovery-rate',
    '-fdr',
    required=False,
    default=0.05,
    type=float,
    help='''Remove all terms with an FDR above this cutoff. The default 0.05.''')

parser.add_argument(
    '--slack',
    '-s',
    required=False,
    default=0,
    type=int,
    help='''Add this length to the end of each bed region. Default: 0.''')

parser.add_argument(
    '--use-ld',
    '-l',
    required=False,
    default=False,
    action="store_true",
    help='''Whether to look for enrichment in the LD-blocks overlapping the regions instead of just the regions themselves. Default: False.'''
)

parser.add_argument('--plot', '-p', required=False, type=str,
                    help="Path to store plot in. Allowed extensions: .html, .pdf, .svg.")

parser.add_argument('--top-k', '-k', required=False, type=int,
                    help="Number of top terms (in terms of OR) from each file to include in the output.")

parser.add_argument(
    '--outfile',
    '-o',
    required=False,
    type=str,
    help='''File to store results in. Missing paths are created. Default: stdout.'''
)

parser.add_argument(
    '--example',
    '-x',
    required=False,
    # Fix: this is consumed as a boolean flag in __main__
    # (`if args["example"]:`), but was declared type=str, forcing the user
    # to pass a dummy value. store_true makes `-x` alone work.
    default=False,
    action="store_true",
    help='''Print example command and exit.'''
)
def get_gwas(gwas_path):
    """Return the GWAS Catalog as a PyRanges keyed by SNP position and trait.

    Reads the catalog from ``gwas_path`` if that file exists; otherwise
    downloads it via pyranges_db and caches it there (creating missing
    parent directories, as the CLI help promises). The result keeps the
    DISEASE/TRAIT (renamed "Trait") and MAPPED_TRAIT columns, deduplicated
    per (Chromosome, Start, End, Trait), with "chr"-prefixed chromosome
    names to match the bed inputs.
    """
    gwas_path = pathlib.Path(gwas_path)
    if gwas_path.is_file():
        # Fix: the original read the path from the module-level `args`
        # dict here and on the next line, which broke any non-CLI use of
        # this function; use the parameter instead.
        print("Reading the GWAS file {}.".format(gwas_path), file=sys.stderr)
        gwas = pd.read_table(gwas_path, low_memory=False)
    else:
        # Imported lazily so the dependency is only needed for the
        # download path.
        import pyranges_db as db
        print("Downloading the GWAS Catalog data to '{}'".format(gwas_path), file=sys.stderr)
        gwas = db.gwas_catalog.gwas_catalog()
        # The --gwas_path help says "Missing paths are created".
        gwas_path.parent.mkdir(parents=True, exist_ok=True)
        gwas.to_csv(gwas_path, sep="\t")
    # Categoricals cut memory for the two highly repetitive trait columns.
    gwas = gwas.astype({"DISEASE/TRAIT": "category", "MAPPED_TRAIT": "category"})
    gwas = pr.PyRanges(gwas)[["DISEASE/TRAIT", "MAPPED_TRAIT"]]
    gwas = gwas.apply(lambda df: df.rename(columns={"DISEASE/TRAIT": "Trait"}))
    # One row per distinct (position, trait) pair.
    gwas = gwas.apply(lambda df: df.drop_duplicates("Chromosome Start End Trait".split()))
    gwas.Chromosome = "chr" + gwas.Chromosome.astype(str)
    return gwas
def get_regions(region_path, slack=0):
    """Read a bed file of query regions, padded by ``slack``.

    If the bed file is empty, writes an empty CSV to the module-level
    ``outfile`` and terminates the whole program with exit code 0.
    """
    # NOTE(review): per the --slack help this is meant to extend each
    # region; pyranges' slack() presumably pads the intervals — confirm
    # whether it pads one or both ends against the pyranges docs.
    region = pr.read_bed(region_path).slack(slack)
    if not len(region):
        print("No regions in the bed file {}.".format(region_path), file=sys.stderr)
        # NOTE(review): `outfile` is a module-level name defined only in
        # the __main__ block, so calling this function from anywhere else
        # raises NameError here. Consider passing it as a parameter.
        pd.DataFrame().to_csv(outfile)
        sys.exit(0)
    return region
def table(region, gwas):
    """Build per-trait 2x2 contingency counts for Fisher's exact test.

    region: DataFrame of GWAS SNPs that overlap the query regions.
    gwas:   DataFrame of the remaining GWAS SNPs (catalog minus regions).
    Returns one row per trait with columns TP/FP/TN/FN (but see the NOTE
    below about the labeling).
    """
    pos_cols = "Chromosome Start End".split()
    # Totals of distinct SNP positions inside / outside the regions.
    total_region = len(region.drop_duplicates(pos_cols))
    total_gwas = len(gwas.drop_duplicates(pos_cols))
    # tp: this-trait SNPs inside regions; fp: other-trait SNPs inside
    # regions (total_region - tp, written via .abs() since tp <= total);
    # fn: this-trait SNPs outside regions; tn: other-trait SNPs outside.
    tp = region.groupby("Trait").size()
    fp = (tp - total_region).abs()
    fn = gwas.groupby("Trait").size()
    tn = (fn - total_gwas).abs()
    # NOTE(review): the concat order is (tp, fn, fp, tn) but the labels
    # are TP, FP, TN, FN — so column "FP" actually holds fn, "TN" holds
    # fp, and "FN" holds tn. analyse_regions() then passes
    # (TP, FP, TN, FN) = (tp, fn, fp, tn) to fisher_exact, i.e. with fp
    # and fn swapped relative to its (tp, fp, fn, tn) signature — a
    # transpose of the 2x2 table, under which the odds ratio and the
    # two-sided P are invariant, so the reported statistics come out
    # right. The TP/FP/TN/FN columns in the output file are mislabeled,
    # though. Fix the labels and the fisher_exact call TOGETHER, never
    # one without the other.
    df = pd.concat([tp, fn, fp, tn], axis=1)
    df.columns = "TP FP TN FN".split()
    return df
def find_major_terms(df, k=5):
    """Keep only traits that rank in the top ``k`` by OR for some input file.

    df must have Trait, OR, FDR and Name columns (Name identifies the
    input bed file). For each Name group the ``k`` rows with the highest
    odds ratios are flagged Top; every row whose Trait is Top in at least
    one group is kept. Trait descriptions are truncated to 50 characters
    and infinite odds ratios are capped at the largest finite OR.
    Returns an empty DataFrame when df has no groups.
    """
    dfs = []
    for _, group in df.groupby("Name"):
        group = group[["Trait", "OR", "FDR", "Name"]].reset_index(drop=True)
        # Fix: the original sorted ascending and relied on the caller's
        # pre-sort plus pandas index alignment to mark the right rows;
        # sort descending explicitly so the first rows ARE the top ORs.
        group = group.sort_values("OR", ascending=False).reset_index(drop=True)
        # Clamp so groups smaller than k are handled cleanly.
        n_top = min(k, len(group))
        top = pd.Series([True] * n_top + [False] * (len(group) - n_top))
        group.insert(group.shape[1], "Top", top)
        dfs.append(group)
    if not dfs:
        return pd.DataFrame()
    df = pd.concat(dfs)
    # A trait flagged Top in ANY file keeps all of its rows.
    top_traits = df[df.Top].Trait.values
    df = df[df.Trait.isin(top_traits)]
    # Truncate long trait descriptions for readable output.
    new_desc = df.pop("Trait").str.slice(0, 50).str.strip()
    df.insert(0, "Trait", new_desc)
    # Fix: replacing inf with df.OR.max() was a no-op (the max of a series
    # containing inf is inf); cap at the largest FINITE odds ratio instead.
    finite = df.OR[np.isfinite(df.OR)]
    if len(finite):
        df.loc[:, "OR"] = df.OR.replace(np.inf, finite.max())
    return df.reset_index(drop=True)
def analyse_regions(region, gwas, name=None):
    """Test each GWAS trait for SNP enrichment in ``region`` with Fisher's exact test.

    region: PyRanges of query intervals; gwas: PyRanges of catalog SNPs
    (from get_gwas). Returns a DataFrame sorted by P with OR/P/FDR plus the
    contingency counts, or an empty DataFrame when nothing overlaps.
    ``name`` (if given) is appended as a "Name" column identifying the
    input file.
    """
    # print("Finding the intersection between the GWAS and regions.", file=sys.stderr)
    # Restrict the catalog to SNPs that fall inside the query regions.
    region = gwas.overlap(region)
    if not len(region):
        print("No regions overlapping the GWAS catalog for {}.".format(name), file=sys.stderr)
        return pd.DataFrame()
    # NOTE(review): pyranges apply() presumably runs per chromosome, so
    # this keeps one SNP per trait *per chromosome*, not one per trait
    # overall — confirm that is intended.
    region = region.apply(lambda df: df.drop_duplicates("Trait"))
    gwas_minus_region = gwas.subtract(region)
    # print("Finding the counts of SNPs in the regions for each of the {} terms.".format(len(region)), file=sys.stderr)
    df = table(region.df, gwas_minus_region.df)
    from pyranges.statistics import fisher_exact, fdr
    # NOTE(review): table() labels its columns in a different order than it
    # stores the values; this argument order compensates by transposing the
    # 2x2 table (OR and two-sided P are unaffected by a transpose). If the
    # labels in table() are ever fixed, this call must change with them.
    fe = fisher_exact(df.TP, df.FP, df.TN, df.FN, pseudocount=0.01)
    df.insert(0, "OR", fe.OR.values)
    df.insert(0, "P", fe.P.values)
    # Benjamini-Hochberg-style correction over the per-trait p-values —
    # confirm against pyranges.statistics.fdr docs.
    df.insert(0, "FDR", fdr(fe.P.values))
    df = df.sort_values("P")
    if not (name is None):
        df.insert(df.shape[1], "Name", name)
    return df
if __name__ == "__main__":
    args = vars(parser.parse_args())

    plot = args["plot"]
    if plot:
        # Fail fast if plotting was requested but plotly is not installed.
        import plotly

    if args["example"]:
        print("rwas", file=sys.stderr)
        # Fix: the help promises "Print example command and exit", but the
        # original fell through and crashed later on unbound `files`.
        sys.exit(0)

    outfile = args["outfile"]
    if outfile:
        # "Missing paths are created" (see --outfile help).
        pathlib.Path(outfile).parent.mkdir(exist_ok=True, parents=True)
    else:
        outfile = sys.stdout

    gwas = get_gwas(args["gwas_path"])

    use_ld = args["use_ld"]
    slack = args["slack"]
    if slack is not None and slack > 0:
        print("Adding a slack of {} to the regions.".format(slack), file=sys.stderr)

    if use_ld:
        print("Using ld-blocks from population EUR.", file=sys.stderr)
        # The LD-block bed ships inside the installed nrwas package,
        # one directory up from this script.
        this_path = pathlib.Path(__file__).resolve().parent.parent
        filepath = str(this_path / "nrwas/EUR_hg38_fixed.bed.gz")
        ld = pr.read_bed(filepath)
    else:
        print("Not using ld-blocks.", file=sys.stderr)

    # --bed takes precedence over --sample-sheet when both are given.
    if args["bed"]:
        files = [args["bed"]]
        names = [None]
    elif args["sample_sheet"]:
        ss = pd.read_table(args["sample_sheet"], sep="\t")
        files, names = ss.File, ss.Name
    else:
        # Fix: the original left `files`/`names` unbound here and crashed
        # with a NameError at the zip() below.
        print("Error: either --bed or --sample-sheet is required.", file=sys.stderr)
        sys.exit(1)

    results = []
    for f, n in zip(files, names):
        print("Reading the region file {}.".format(f), file=sys.stderr)
        region = get_regions(f, slack)
        if use_ld:
            # Analyse the LD blocks overlapping the regions instead of the
            # regions themselves.
            region = ld.overlap(region).drop_duplicate_positions()
        result_ = analyse_regions(region, gwas, n if n else f)
        results.append(result_)

    df = pd.concat(results)
    terms_analyzed = len(df)  # count before the FDR filter (currently unused)

    # NOTE(review): a cutoff of exactly 0 is falsy and silently disables
    # the filter — presumably intended as "keep everything"; confirm.
    if args["false_discovery_rate"]:
        df = df[df.FDR <= args["false_discovery_rate"]]

    df = df.sort_values("OR", ascending=False)

    if plot:
        pathlib.Path(plot).parent.mkdir(exist_ok=True, parents=True)

    df = df.reset_index()
    if args["top_k"]:
        df = find_major_terms(df, args["top_k"])

    df.to_csv(outfile, sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment