Skip to content

Instantly share code, notes, and snippets.

@endrebak
Last active December 11, 2019 12:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save endrebak/b49a10c3696f619e000ae65aae96ee68 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import sys
import pandas as pd
import numpy as np
import pyranges as pr
import argparse
import pathlib
from nrwas.version import __version__
# Sniff sys.argv before parsing: an example/version invocation runs no
# analysis, and a sample sheet replaces --bed/--gwas_path, so in both cases
# the otherwise-required flags are relaxed below.
# Fix: "-x" is the short flag actually registered for --example, but the
# original only checked "-ex", so "-x" still demanded --gwas_path.
# NOTE(review): no --version/-v argument is registered on the parser, so
# argparse will reject those flags even though they are sniffed here.
example_run = ("--version" in sys.argv or "-v" in sys.argv
               or "-x" in sys.argv or "-ex" in sys.argv or "--example" in sys.argv)
using_sample_sheet = ("--sample-sheet" in sys.argv or "-ss" in sys.argv)

parser = argparse.ArgumentParser(
    description="""nrwas version: {}""".format(__version__),
    prog=os.path.basename(__file__))

parser.add_argument(
    '--gwas_path',
    '-g',
    # Required for a real run; optional for example runs or sample sheets.
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Path where the extended gwas catalog file is stored. If it does not exist it will be downloaded,
and this is the path where it will be stored. Missing paths are created.''')

parser.add_argument(
    '--bed',
    '-b',
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Bed file of regions to investigate for SNP enrichment.'''
)

parser.add_argument(
    '--sample-sheet',
    '-ss',
    required=False,
    type=str,
    help='''Tab-separated file with two columns: File and Name, where the files are the bed files to analyse and the names are descriptions used in the output files.'''
)

parser.add_argument(
    '--false-discovery-rate',
    '-fdr',
    required=False,
    default=0.05,
    type=float,
    help='''Remove all terms with an FDR above this cutoff. The default 0.05.''')

parser.add_argument(
    '--slack',
    '-s',
    required=False,
    default=0,
    type=int,
    help='''Add this length to the end of each bed region. Default: 0.''')

parser.add_argument(
    '--use-ld',
    '-l',
    required=False,
    default=False,
    action="store_true",
    help='''Whether to look for enrichment in the LD-blocks overlapping the regions instead of just the regions themselves. Default: False.'''
)

parser.add_argument('--plot', '-p', required=False, type=str,
                    help="Path to store plot in. Allowed extensions: .html, .pdf, .svg.")

parser.add_argument('--top-k', '-k', required=False, type=int,
                    help="Number of top terms (in terms of OR) from each file to include in the output.")

parser.add_argument(
    '--outfile',
    '-o',
    required=False,
    type=str,
    help='''File to store results in. Missing paths are created. Default: stdout.'''
)

parser.add_argument(
    '--example',
    '-x',
    required=False,
    # Fix: this is consumed as a boolean flag in __main__
    # (`if args["example"]:`), but was declared type=str, forcing the user
    # to pass a dummy value. store_true makes `-x` alone work.
    default=False,
    action="store_true",
    help='''Print example command and exit.'''
)
def get_gwas(gwas_path):
    """Return the GWAS Catalog as a PyRanges keyed by SNP position and trait.

    Reads the catalog from ``gwas_path`` if that file exists; otherwise
    downloads it via pyranges_db and caches it there (creating missing
    parent directories, as the CLI help promises). The result keeps the
    DISEASE/TRAIT (renamed "Trait") and MAPPED_TRAIT columns, deduplicated
    per (Chromosome, Start, End, Trait), with "chr"-prefixed chromosome
    names to match the bed inputs.
    """
    gwas_path = pathlib.Path(gwas_path)
    if gwas_path.is_file():
        # Fix: the original read the path from the module-level `args`
        # dict here and on the next line, which broke any non-CLI use of
        # this function; use the parameter instead.
        print("Reading the GWAS file {}.".format(gwas_path), file=sys.stderr)
        gwas = pd.read_table(gwas_path, low_memory=False)
    else:
        # Imported lazily so the dependency is only needed for the
        # download path.
        import pyranges_db as db
        print("Downloading the GWAS Catalog data to '{}'".format(gwas_path), file=sys.stderr)
        gwas = db.gwas_catalog.gwas_catalog()
        # The --gwas_path help says "Missing paths are created".
        gwas_path.parent.mkdir(parents=True, exist_ok=True)
        gwas.to_csv(gwas_path, sep="\t")
    # Categoricals cut memory for the two highly repetitive trait columns.
    gwas = gwas.astype({"DISEASE/TRAIT": "category", "MAPPED_TRAIT": "category"})
    gwas = pr.PyRanges(gwas)[["DISEASE/TRAIT", "MAPPED_TRAIT"]]
    gwas = gwas.apply(lambda df: df.rename(columns={"DISEASE/TRAIT": "Trait"}))
    # One row per distinct (position, trait) pair.
    gwas = gwas.apply(lambda df: df.drop_duplicates("Chromosome Start End Trait".split()))
    gwas.Chromosome = "chr" + gwas.Chromosome.astype(str)
    return gwas
def get_regions(region_path, slack=0):
    """Read a bed file of query regions, padded by ``slack``.

    If the bed file is empty, writes an empty CSV to the module-level
    ``outfile`` and terminates the whole program with exit code 0.
    """
    # NOTE(review): per the --slack help this is meant to extend each
    # region; pyranges' slack() presumably pads the intervals — confirm
    # whether it pads one or both ends against the pyranges docs.
    region = pr.read_bed(region_path).slack(slack)
    if not len(region):
        print("No regions in the bed file {}.".format(region_path), file=sys.stderr)
        # NOTE(review): `outfile` is a module-level name defined only in
        # the __main__ block, so calling this function from anywhere else
        # raises NameError here. Consider passing it as a parameter.
        pd.DataFrame().to_csv(outfile)
        sys.exit(0)
    return region
def table(region, gwas):
    """Build per-trait 2x2 contingency counts for Fisher's exact test.

    region: DataFrame of GWAS SNPs that overlap the query regions.
    gwas:   DataFrame of the remaining GWAS SNPs (catalog minus regions).
    Returns one row per trait with columns TP/FP/TN/FN (but see the NOTE
    below about the labeling).
    """
    pos_cols = "Chromosome Start End".split()
    # Totals of distinct SNP positions inside / outside the regions.
    total_region = len(region.drop_duplicates(pos_cols))
    total_gwas = len(gwas.drop_duplicates(pos_cols))
    # tp: this-trait SNPs inside regions; fp: other-trait SNPs inside
    # regions (total_region - tp, written via .abs() since tp <= total);
    # fn: this-trait SNPs outside regions; tn: other-trait SNPs outside.
    tp = region.groupby("Trait").size()
    fp = (tp - total_region).abs()
    fn = gwas.groupby("Trait").size()
    tn = (fn - total_gwas).abs()
    # NOTE(review): the concat order is (tp, fn, fp, tn) but the labels
    # are TP, FP, TN, FN — so column "FP" actually holds fn, "TN" holds
    # fp, and "FN" holds tn. analyse_regions() then passes
    # (TP, FP, TN, FN) = (tp, fn, fp, tn) to fisher_exact, i.e. with fp
    # and fn swapped relative to its (tp, fp, fn, tn) signature — a
    # transpose of the 2x2 table, under which the odds ratio and the
    # two-sided P are invariant, so the reported statistics come out
    # right. The TP/FP/TN/FN columns in the output file are mislabeled,
    # though. Fix the labels and the fisher_exact call TOGETHER, never
    # one without the other.
    df = pd.concat([tp, fn, fp, tn], axis=1)
    df.columns = "TP FP TN FN".split()
    return df
def find_major_terms(df, k=5):
    """Keep only traits that rank in the top ``k`` by OR for some input file.

    df must have Trait, OR, FDR and Name columns (Name identifies the
    input bed file). For each Name group the ``k`` rows with the highest
    odds ratios are flagged Top; every row whose Trait is Top in at least
    one group is kept. Trait descriptions are truncated to 50 characters
    and infinite odds ratios are capped at the largest finite OR.
    Returns an empty DataFrame when df has no groups.
    """
    dfs = []
    for _, group in df.groupby("Name"):
        group = group[["Trait", "OR", "FDR", "Name"]].reset_index(drop=True)
        # Fix: the original sorted ascending and relied on the caller's
        # pre-sort plus pandas index alignment to mark the right rows;
        # sort descending explicitly so the first rows ARE the top ORs.
        group = group.sort_values("OR", ascending=False).reset_index(drop=True)
        # Clamp so groups smaller than k are handled cleanly.
        n_top = min(k, len(group))
        top = pd.Series([True] * n_top + [False] * (len(group) - n_top))
        group.insert(group.shape[1], "Top", top)
        dfs.append(group)
    if not dfs:
        return pd.DataFrame()
    df = pd.concat(dfs)
    # A trait flagged Top in ANY file keeps all of its rows.
    top_traits = df[df.Top].Trait.values
    df = df[df.Trait.isin(top_traits)]
    # Truncate long trait descriptions for readable output.
    new_desc = df.pop("Trait").str.slice(0, 50).str.strip()
    df.insert(0, "Trait", new_desc)
    # Fix: replacing inf with df.OR.max() was a no-op (the max of a series
    # containing inf is inf); cap at the largest FINITE odds ratio instead.
    finite = df.OR[np.isfinite(df.OR)]
    if len(finite):
        df.loc[:, "OR"] = df.OR.replace(np.inf, finite.max())
    return df.reset_index(drop=True)
def analyse_regions(region, gwas, name=None):
    """Test each GWAS trait for SNP enrichment in ``region`` with Fisher's exact test.

    region: PyRanges of query intervals; gwas: PyRanges of catalog SNPs
    (from get_gwas). Returns a DataFrame sorted by P with OR/P/FDR plus the
    contingency counts, or an empty DataFrame when nothing overlaps.
    ``name`` (if given) is appended as a "Name" column identifying the
    input file.
    """
    # print("Finding the intersection between the GWAS and regions.", file=sys.stderr)
    # Restrict the catalog to SNPs that fall inside the query regions.
    region = gwas.overlap(region)
    if not len(region):
        print("No regions overlapping the GWAS catalog for {}.".format(name), file=sys.stderr)
        return pd.DataFrame()
    # NOTE(review): pyranges apply() presumably runs per chromosome, so
    # this keeps one SNP per trait *per chromosome*, not one per trait
    # overall — confirm that is intended.
    region = region.apply(lambda df: df.drop_duplicates("Trait"))
    gwas_minus_region = gwas.subtract(region)
    # print("Finding the counts of SNPs in the regions for each of the {} terms.".format(len(region)), file=sys.stderr)
    df = table(region.df, gwas_minus_region.df)
    from pyranges.statistics import fisher_exact, fdr
    # NOTE(review): table() labels its columns in a different order than it
    # stores the values; this argument order compensates by transposing the
    # 2x2 table (OR and two-sided P are unaffected by a transpose). If the
    # labels in table() are ever fixed, this call must change with them.
    fe = fisher_exact(df.TP, df.FP, df.TN, df.FN, pseudocount=0.01)
    df.insert(0, "OR", fe.OR.values)
    df.insert(0, "P", fe.P.values)
    # Benjamini-Hochberg-style correction over the per-trait p-values —
    # confirm against pyranges.statistics.fdr docs.
    df.insert(0, "FDR", fdr(fe.P.values))
    df = df.sort_values("P")
    if not (name is None):
        df.insert(df.shape[1], "Name", name)
    return df
if __name__ == "__main__":
    args = vars(parser.parse_args())

    plot = args["plot"]
    if plot:
        # Fail fast if plotting was requested but plotly is not installed.
        import plotly

    if args["example"]:
        print("rwas", file=sys.stderr)
        # Fix: the help promises "Print example command and exit", but the
        # original fell through and crashed later on unbound `files`.
        sys.exit(0)

    outfile = args["outfile"]
    if outfile:
        # "Missing paths are created" (see --outfile help).
        pathlib.Path(outfile).parent.mkdir(exist_ok=True, parents=True)
    else:
        outfile = sys.stdout

    gwas = get_gwas(args["gwas_path"])

    use_ld = args["use_ld"]
    slack = args["slack"]
    if slack is not None and slack > 0:
        print("Adding a slack of {} to the regions.".format(slack), file=sys.stderr)

    if use_ld:
        print("Using ld-blocks from population EUR.", file=sys.stderr)
        # The LD-block bed ships inside the installed nrwas package,
        # one directory up from this script.
        this_path = pathlib.Path(__file__).resolve().parent.parent
        filepath = str(this_path / "nrwas/EUR_hg38_fixed.bed.gz")
        ld = pr.read_bed(filepath)
    else:
        print("Not using ld-blocks.", file=sys.stderr)

    # --bed takes precedence over --sample-sheet when both are given.
    if args["bed"]:
        files = [args["bed"]]
        names = [None]
    elif args["sample_sheet"]:
        ss = pd.read_table(args["sample_sheet"], sep="\t")
        files, names = ss.File, ss.Name
    else:
        # Fix: the original left `files`/`names` unbound here and crashed
        # with a NameError at the zip() below.
        print("Error: either --bed or --sample-sheet is required.", file=sys.stderr)
        sys.exit(1)

    results = []
    for f, n in zip(files, names):
        print("Reading the region file {}.".format(f), file=sys.stderr)
        region = get_regions(f, slack)
        if use_ld:
            # Analyse the LD blocks overlapping the regions instead of the
            # regions themselves.
            region = ld.overlap(region).drop_duplicate_positions()
        result_ = analyse_regions(region, gwas, n if n else f)
        results.append(result_)

    df = pd.concat(results)
    terms_analyzed = len(df)  # count before the FDR filter (currently unused)

    # NOTE(review): a cutoff of exactly 0 is falsy and silently disables
    # the filter — presumably intended as "keep everything"; confirm.
    if args["false_discovery_rate"]:
        df = df[df.FDR <= args["false_discovery_rate"]]

    df = df.sort_values("OR", ascending=False)

    if plot:
        pathlib.Path(plot).parent.mkdir(exist_ok=True, parents=True)

    df = df.reset_index()
    if args["top_k"]:
        df = find_major_terms(df, args["top_k"])

    df.to_csv(outfile, sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment