Last active
December 11, 2019 12:01
-
-
Save endrebak/b49a10c3696f619e000ae65aae96ee68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import pandas as pd | |
import numpy as np | |
import pyranges as pr | |
import argparse | |
import pathlib | |
from nrwas.version import __version__ | |
# Pre-scan argv so that --gwas_path/--bed can be made optional for runs
# that only want the version / example command, or that supply a sample
# sheet instead of a single bed file.
example_run = ("--version" in sys.argv or "-v" in sys.argv
               or "-ex" in sys.argv or "--example" in sys.argv)
using_sample_sheet = ("--sample-sheet" in sys.argv or "-ss" in sys.argv)

parser = argparse.ArgumentParser(
    description="""nrwas version: {}""".format(__version__),
    prog=os.path.basename(__file__))

# BUG FIX: '--version'/'-v' were looked for in sys.argv above but never
# registered, so argparse rejected them as unknown arguments.
parser.add_argument(
    '--version',
    '-v',
    action='version',
    version=str(__version__))

parser.add_argument(
    '--gwas_path',
    '-g',
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Path where the extended gwas catalog file is stored. If it does not exist it will be downloaded,
and this is the path where it will be stored. Missing paths are created.''')

parser.add_argument(
    '--bed',
    '-b',
    required=not (example_run or using_sample_sheet),
    type=str,
    help='''Bed file of regions to investigate for SNP enrichment.''')

parser.add_argument(
    '--sample-sheet',
    '-ss',
    required=False,
    type=str,
    help='''Tab-separated file with two columns: File and Name, where the files are the bed files to analyse and the names are descriptions used in the output files.''')

parser.add_argument(
    '--false-discovery-rate',
    '-fdr',
    required=False,
    default=0.05,
    type=float,
    help='''Remove all terms with an FDR above this cutoff. The default 0.05.''')

parser.add_argument(
    '--slack',
    '-s',
    required=False,
    default=0,
    type=int,
    help='''Add this length to the end of each bed region. Default: 0.''')

parser.add_argument(
    '--use-ld',
    '-l',
    required=False,
    default=False,
    action="store_true",
    help='''Whether to look for enrichment in the LD-blocks overlapping the regions instead of just the regions themselves. Default: False.''')

parser.add_argument('--plot', '-p', required=False, type=str,
                    help="Path to store plot in. Allowed extensions: .html, .pdf, .svg.")

parser.add_argument('--top-k', '-k', required=False, type=int,
                    help="Number of top terms (in terms of OR) from each file to include in the output.")

parser.add_argument(
    '--outfile',
    '-o',
    required=False,
    type=str,
    help='''File to store results in. Missing paths are created. Default: stdout.''')

# BUG FIX: '--example' was declared with type=str (so it demanded a value)
# and only with the short form '-x', although the pre-scan above looks for
# the bare flags '-ex'/'--example'.  Declare it as a flag with both forms.
parser.add_argument(
    '--example',
    '-x',
    '-ex',
    required=False,
    action="store_true",
    help='''Print example command and exit.''')
def get_gwas(gwas_path):
    """Load the extended GWAS catalog as a PyRanges object.

    Reads the table from ``gwas_path`` when the file exists; otherwise
    downloads the GWAS Catalog via pyranges_db and caches it there.
    Returns a PyRanges with one "Trait" column (renamed from
    DISEASE/TRAIT), deduplicated per position+trait, with 'chr'-prefixed
    chromosome names.
    """
    gwas_path = pathlib.Path(gwas_path)
    if gwas_path.is_file():
        print("Reading the GWAS file {}.".format(gwas_path), file=sys.stderr)
        # BUG FIX: previously read the module-global args["gwas_path"]
        # instead of the parameter, breaking any caller that did not go
        # through the argparse entry point.
        gwas = pd.read_table(gwas_path, low_memory=False)
    else:
        # Deferred import: only needed for the one-time download.
        import pyranges_db as db
        print("Downloading the GWAS Catalog data to '{}'".format(gwas_path), file=sys.stderr)
        gwas = db.gwas_catalog.gwas_catalog()
        gwas.to_csv(gwas_path, sep="\t")
    # Categoricals keep memory down for the two repetitive trait columns.
    gwas = gwas.astype({"DISEASE/TRAIT": "category", "MAPPED_TRAIT": "category"})
    gwas = pr.PyRanges(gwas)[["DISEASE/TRAIT", "MAPPED_TRAIT"]]
    gwas = gwas.apply(lambda df: df.rename(columns={"DISEASE/TRAIT": "Trait"}))
    gwas = gwas.apply(lambda df: df.drop_duplicates("Chromosome Start End Trait".split()))
    # Bed files use UCSC-style 'chr1'; the catalog uses bare '1'.
    gwas.Chromosome = "chr" + gwas.Chromosome.astype(str)
    return gwas
def get_regions(region_path, slack=0):
    """Read a bed file of regions, extending each interval by ``slack`` bp.

    Exits the whole program (after writing an empty CSV) when the bed
    file contains no regions, mirroring the script's original behaviour.
    """
    region = pr.read_bed(region_path).slack(slack)
    if not len(region):
        print("No regions in the bed file {}.".format(region_path), file=sys.stderr)
        # BUG FIX: 'outfile' is a global that only exists when run as a
        # script; fall back to stdout so library callers do not hit a
        # NameError on this error path.
        pd.DataFrame().to_csv(globals().get("outfile", sys.stdout))
        sys.exit(0)
    return region
def table(region, gwas):
    """Per-trait 2x2 contingency counts for region/GWAS overlap.

    ``region`` holds the GWAS SNPs that fall inside the query regions,
    ``gwas`` the remaining catalog SNPs; both are plain DataFrames with
    Chromosome/Start/End/Trait columns.

    NOTE(review): the column labels do not match their contents -- the
    concat order is [in-region, in-rest, region-complement, rest-
    complement], so "FP" actually holds the trait's SNPs outside the
    regions and "TN" the non-trait SNPs inside them.  The downstream
    fisher_exact call compensates (the 2x2 table is merely transposed,
    which leaves OR and p unchanged), so the mapping is preserved here
    byte-for-byte.
    """
    coords = ["Chromosome", "Start", "End"]
    n_region = region.drop_duplicates(coords).shape[0]
    n_gwas = gwas.drop_duplicates(coords).shape[0]

    # SNPs of each trait inside the regions, and the region remainder.
    in_region = region.groupby("Trait").size()
    region_rest = (in_region - n_region).abs()

    # SNPs of each trait outside the regions, and that remainder.
    in_rest = gwas.groupby("Trait").size()
    gwas_rest = (in_rest - n_gwas).abs()

    counts = pd.concat([in_region, in_rest, region_rest, gwas_rest], axis=1)
    counts.columns = ["TP", "FP", "TN", "FN"]
    return counts
def find_major_terms(df, k=5):
    """Keep the traits that are among the k highest-OR terms of any Name.

    For every input file (the "Name" column) the k terms with the largest
    odds ratios are flagged as Top; the result then contains *all* rows
    whose trait is a top trait for at least one file, so files can be
    compared on a shared trait set.  Trait descriptions are truncated to
    50 characters and infinite ORs are clamped to the finite maximum.
    Returns an empty DataFrame when the input has no groups.
    """
    per_name = []
    for _, ndf in df.groupby("Name"):
        ndf = ndf[["Trait", "OR", "FDR", "Name"]]
        # BUG FIX: the original sorted ascending and aligned a [1]*k
        # boolean Series by *label*, which only marked the true top-k
        # because the caller happened to pre-sort descending and reset
        # the index.  Rank here, positionally, so the function is
        # correct on its own.
        ndf = ndf.sort_values("OR", ascending=False).reset_index(drop=True)
        ndf.insert(ndf.shape[1], "Top", ndf.index < k)
        per_name.append(ndf)

    if not per_name:
        return pd.DataFrame()
    df = pd.concat(per_name)

    # Keep every row whose trait is a top trait for at least one file.
    top_traits = df[df.Top == 1].Trait.values
    df = df[df.Trait.isin(top_traits)]

    # Shorten unwieldy trait descriptions for plotting/reporting.
    new_desc = df.pop("Trait").str.slice(0, 50).str.strip()
    df.insert(0, "Trait", new_desc)

    # Pseudocounted fisher tests can still yield inf; clamp for plotting.
    df.loc[:, "OR"] = df.OR.replace(np.inf, df.OR.max())
    return df.reset_index(drop=True)
def analyse_regions(region, gwas, name=None):
    """Fisher-test every GWAS trait for enrichment in ``region``.

    ``region`` and ``gwas`` are PyRanges; returns a DataFrame with OR, P,
    FDR and the 2x2 counts per trait, sorted by p-value, plus a "Name"
    column when ``name`` is given.  Returns an empty DataFrame when the
    regions do not overlap the catalog at all.
    """
    region = gwas.overlap(region)
    if not len(region):
        print("No regions overlapping the GWAS catalog for {}.".format(name), file=sys.stderr)
        return pd.DataFrame()
    # Count each trait at most once per chromosome block.
    region = region.apply(lambda df: df.drop_duplicates("Trait"))
    gwas_minus_region = gwas.subtract(region)
    df = table(region.df, gwas_minus_region.df)
    from pyranges.statistics import fisher_exact, fdr
    # NOTE(review): table()'s FP and TN columns are swapped relative to
    # their names, so this call hands fisher_exact a transposed 2x2
    # table.  OR = tp*tn/(fp*fn) and the p-value are transpose-invariant,
    # so the results are correct -- do not reorder these arguments
    # without also relabelling table()'s columns.
    fe = fisher_exact(df.TP, df.FP, df.TN, df.FN, pseudocount=0.01)
    df.insert(0, "OR", fe.OR.values)
    df.insert(0, "P", fe.P.values)
    df.insert(0, "FDR", fdr(fe.P.values))
    df = df.sort_values("P")
    if name is not None:
        df.insert(df.shape[1], "Name", name)
    return df
if __name__ == "__main__":
    args = vars(parser.parse_args())

    plot = args["plot"]
    if plot:
        # Fail fast if plotting was requested but plotly is not installed.
        import plotly

    if args["example"]:
        # BUG FIX: the help text promises "print example command and
        # exit", but the original fell through and crashed later with a
        # NameError on 'files'.
        print("rwas", file=sys.stderr)
        sys.exit(0)

    outfile = args["outfile"]
    if outfile:
        pathlib.Path(outfile).parent.mkdir(exist_ok=True, parents=True)
    else:
        outfile = sys.stdout

    gwas = get_gwas(args["gwas_path"])

    use_ld = args["use_ld"]
    slack = args["slack"]
    if slack is not None and slack > 0:
        print("Adding a slack of {} to the regions.".format(slack), file=sys.stderr)

    if use_ld:
        print("Using ld-blocks from population EUR.", file=sys.stderr)
        # The LD-block bed ships next to the installed package.
        this_path = pathlib.Path(__file__).resolve().parent.parent
        ld = pr.read_bed(str(this_path / "nrwas/EUR_hg38_fixed.bed.gz"))
    else:
        print("Not using ld-blocks.", file=sys.stderr)

    if args["bed"]:
        files = [args["bed"]]
        names = [None]
    elif args["sample_sheet"]:
        ss = pd.read_table(args["sample_sheet"], sep="\t")
        files, names = ss.File, ss.Name
    else:
        # BUG FIX: previously fell through with 'files'/'names' undefined
        # (NameError) when neither --bed nor --sample-sheet was given.
        parser.error("one of --bed or --sample-sheet is required")

    results = []
    for f, n in zip(files, names):
        print("Reading the region file {}.".format(f), file=sys.stderr)
        region = get_regions(f, slack)
        if use_ld:
            # Analyse the LD blocks touching the regions instead of the
            # regions themselves.
            region = ld.overlap(region).drop_duplicate_positions()
        results.append(analyse_regions(region, gwas, n if n else f))

    df = pd.concat(results)

    if args["false_discovery_rate"]:
        df = df[df.FDR <= args["false_discovery_rate"]]

    df = df.sort_values("OR", ascending=False)

    if plot:
        pathlib.Path(plot).parent.mkdir(exist_ok=True, parents=True)

    # BUG FIX: reset_index() without drop=True leaked a meaningless
    # 'index' column into the output CSV when --top-k was not given.
    df = df.reset_index(drop=True)

    if args["top_k"]:
        df = find_major_terms(df, args["top_k"])

    df.to_csv(outfile, sep="\t", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment