Created
August 26, 2014 21:47
-
-
Save brentp/4ec866b5ab3dd4e7bc46 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
from collections import Counter | |
import toolshed as ts | |
import itertools as it | |
import click | |
import sys | |
import re | |
import array | |
from math import log10 | |
from pyper import R as Rpyper | |
R = Rpyper(use_numpy=False) | |
R.use_dict = True | |
R('source("src/hapskat.R")') | |
@click.command() | |
@click.argument('vcf', type=click.Path(exists=True)) | |
@click.argument('cov', type=click.Path(exists=True)) | |
@click.argument('formula') | |
@click.option('--min-snps', default=4, help="don't test a block unless it" | |
" has at least this many variants.") | |
@click.option('--max-missing', default=10, help="exclude variants with more" | |
" missing variants than this.") | |
def hapskat(vcf, cov, formula, min_snps, max_missing): | |
fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}" | |
block_iter = read_blocks(vcf, min_snps, max_missing) | |
header = next(block_iter) | |
samples = header[9:] | |
print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)), | |
file=sys.stderr) | |
R('wgts <- NULL') | |
print("#" + fmt.replace("}", "").replace("{", "")) | |
for i, block in enumerate(block_iter): | |
Z, rare_haps = merge_rare(getZ(block, header)) | |
assert [r[0] for r in Z[1:]] == samples, ("bad order") | |
# remove column and row labels | |
Zsub = [row[1:] for row in Z[1:]] | |
send_array(Zsub) | |
#R['wgts'] = wgts | |
r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n") | |
r = [x.strip() for x in r if x.strip() and not x.startswith('try')] | |
if r: | |
print("\n".join(r), file=sys.stderr) | |
res = R['res'] | |
locs = get_locs(block) | |
if len(locs) != len(set(locs)): | |
print(len(locs), len(set(locs)), file=sys.stderr) | |
print(res, file=sys.stderr) | |
res['chrom'] = locs[0].split(":")[0] | |
res['start'] = locs[0].split(":")[1] | |
res['end'] = locs[-1].split(":")[1] | |
res['haplotypes'] = "|".join(Z[0][i] for i in range(1, len(Z[0])) if | |
not Z[0][i].startswith("rare")) | |
res['rare_haplotypes'] = "|".join(rare_haps) or "NA" | |
print(fmt.format(**res)) | |
print(i + 1, "tests", file=sys.stderr) | |
def merge_rare(Z): | |
# Z is row and column headers is of shape n_samples * n_haplotypes. | |
# use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare. | |
rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5 | |
col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1) for i in range(1, len(Z[0]))] | |
rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff] | |
# since we know that Z has the most common columns first, we can simplify | |
# the stuff below to just use the first rare column | |
# just check that we don't get, e.g. 2,4,5 instaed of 2,3,4,5 | |
if True: #rare_idxs == []: | |
return Z, [] | |
assert rare_idxs == range(min(rare_idxs), max(rare_idxs) + 1) | |
Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']] | |
for row in Z[1:]: | |
# the row label and the common haplotypes. | |
new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])] | |
# the merged (summed) rare haplotypes go to a single column | |
new_row.append(min(2, sum(row[i + 1] for i in rare_idxs))) | |
Znew.append(new_row) | |
# return merged data along with the list of rare haplotypes. | |
return Znew, [Z[0][i + 1] for i in rare_idxs] | |
def getZ(block, header): | |
sample_haps = [] | |
sample_haps.extend((get_phases(block, i) for i in range(9, len(header)))) | |
c = Counter() | |
for row in sample_haps: | |
c.update(row) | |
# TODO: handle haplotypes containing 'X' (genotype of NA within hap) | |
Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]] | |
for i, row in enumerate(sample_haps): | |
# e.g. if it has same hap from both parents then | |
# it will have a score of Z for that haplotype | |
Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]]) | |
return Z | |
def tryint(x): | |
try: return int(x) | |
except ValueError: return -1 | |
def get_locs(block): | |
return ["%s:%i" % (b[0], int(b[1])) for b in block] | |
def get_phases(block, idx, sep = re.compile("\||/")): | |
""" | |
use the 0|1 in the genotype column to look up the ref/alt bases | |
""" | |
refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block] | |
genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block] | |
genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block] | |
return "_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),\ | |
"_".join([refalts[i][gp] for i, gp in enumerate(genosP)]) | |
def is_phased(toks): | |
return any("|" in t.split(":")[0] for t in toks[9:]) | |
def n_missing(toks): | |
return sum(t in (".", "./.") for t in toks[9:]) | |
def read_blocks(vcf, min_block_size, max_missing_samples): | |
""" | |
A haplotype block always contain "/", then the following lines in the | |
same block all contain "|". A line with "/" indicates the start of a | |
new block. This function takes a VCF and yields blocks | |
""" | |
fh, header = ts.reader(vcf, header=False), [None] | |
while header[0] != "#CHROM": | |
header = next(fh) | |
header[0] = "CHROM" | |
yield header | |
last_un = None | |
for phased, block in it.groupby(ts.reader(fh, header=False), is_phased): | |
if not phased: | |
last_un = list(block)[-1] | |
continue | |
assert last_un | |
block = [toks for toks in it.chain([last_un], block) if n_missing(toks) <= | |
max_missing_samples] | |
# subtract 1 for row name. | |
if len(block) - 1 < min_block_size: continue | |
last_un = None | |
yield block | |
def send_array(arr, fh=open('/tmp/t.bin', 'w')): | |
fh.seek(0) | |
# send shape as int | |
array.array('l', [len(arr), len(arr[0])]).tofile(fh) | |
# send the data as double | |
array.array('d', [i for row in arr for i in row]).tofile(fh) | |
fh.flush() | |
if __name__ == "__main__": | |
hapskat() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(SKAT) | |
set.seed(1) | |
hap.skat.null = function(formula, covs, ...){ | |
covs = read.delim(covs) | |
assign("y", covs[[as.character(formula[[2]])]], envir = .GlobalEnv) | |
m = SKAT_Null_Model(formula, data=covs, out_type="D") | |
return(m) | |
} | |
hap.skat = function(Z, null.obj, y, wgts=NULL, ...){ | |
r = try(SKAT(Z, null.obj, weights=wgts, is_dosage=TRUE, | |
kernel=ifelse(is.null(wgts), "linear", "linear.weighted"), method="liu")) | |
if(!inherits(r, "try-error")){ | |
res = list(p_value=r$p.value, rho=r$param$rho_est, | |
n_tested=r$param$n.marker.test, | |
p_value_resampling=r$p.value.resampling) | |
} else { | |
res = list(p_value=NaN, rho=NaN, n_tested=NaN, p_value_resampling=NaN) | |
} | |
if("X1" %in% names(null.obj)){ | |
res$f_p_value = ftest(null.obj$X1, Z, y) | |
} else { | |
res$f_p_value = ftest(null.obj$re1$X1, Z, y) | |
} | |
res | |
} | |
ftest = function(X1, Z, y){ | |
# TODO: convert into formula so we can get f.test | |
X1 = data.frame(X1) | |
colnames(Z) = paste0("Z", 1:ncol(Z)) | |
colnames(X1)[1] = "intercept" | |
xnames = colnames(X1) | |
X1$yyy = y | |
m0 = as.formula(sprintf("yyy ~ 0 + %s", paste(xnames, collapse=" + "))) | |
fit0 = glm(m0, data=X1, family=binomial(link=logit)) | |
mz = as.formula(sprintf("yyy ~ 0 + %s + %s", | |
paste(xnames, collapse=" + "), | |
paste(colnames(Z), collapse=" + "))) | |
X2 = cbind(X1, Z) | |
fitz = glm(mz, data=X2, family=binomial(link=logit)) | |
r = anova(fit0, fitz, test="Chisq") | |
return(r[2, "Pr(>Chi)"]) | |
} | |
read.bin = function(bin.file){ | |
conn = file(bin.file, 'rb') | |
mdims = readBin(conn, what=integer(), size=8, n=2) | |
nrow = mdims[1] | |
ncol = mdims[2] | |
dat = readBin(conn, what=numeric(), size=8, n=nrow * ncol) | |
close(conn) | |
matrix(dat, nrow=nrow, ncol=ncol, byrow=TRUE) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment