brentp/hapskat.R

## hapskat.py
from __future__ import print_function
from collections import Counter
import toolshed as ts
import itertools as it
import click
import sys
import re
import array
from math import log10
from pyper import R as Rpyper

R = Rpyper(use_numpy=False)
R.use_dict = True
R('source("src/hapskat.R")')

@click.command()
@click.argument('vcf', type=click.Path(exists=True))
@click.argument('cov', type=click.Path(exists=True))
@click.argument('formula')
@click.option('--min-snps', default=4, help="don't test a block unless it"
                " has at least this many variants.")
@click.option('--max-missing', default=10, help="exclude variants with more"
                " missing variants than this.")
def hapskat(vcf, cov, formula, min_snps, max_missing):

    fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}"
    block_iter = read_blocks(vcf, min_snps, max_missing)
    header = next(block_iter)
    samples = header[9:]
    print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)),
            file=sys.stderr)
    R('wgts <- NULL')
    print("#" + fmt.replace("}", "").replace("{", ""))
    for i, block in enumerate(block_iter):
        Z, rare_haps = merge_rare(getZ(block, header))
        assert [r[0] for r in Z[1:]] == samples, ("bad order")
        # remove column and row labels
        Zsub = [row[1:] for row in Z[1:]]
        send_array(Zsub)
        #R['wgts'] = wgts
        r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n")
        r = [x.strip() for x in r if x.strip() and not x.startswith('try')]
        if r:
            print("\n".join(r), file=sys.stderr)
        res = R['res']
        locs = get_locs(block)
        if len(locs) != len(set(locs)):
            print(len(locs), len(set(locs)), file=sys.stderr)
        print(res, file=sys.stderr)
        res['chrom'] = locs[0].split(":")[0]
        res['start'] = locs[0].split(":")[1]
        res['end'] = locs[-1].split(":")[1]
        res['haplotypes'] = "|".join(Z[0][i] for i in range(1, len(Z[0])) if
                not Z[0][i].startswith("rare"))
        res['rare_haplotypes'] = "|".join(rare_haps) or "NA"
        print(fmt.format(**res))

    print(i + 1, "tests", file=sys.stderr)

def merge_rare(Z):
    # Z is row and column headers is of shape n_samples * n_haplotypes.

    # use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare.
    rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5
    col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1) for i in range(1, len(Z[0]))]
    rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff]
    # since we know that Z has the most common columns first, we can simplify
    # the stuff below to just use the first rare column
    # just check that we don't get, e.g. 2,4,5 instaed of 2,3,4,5
    if True: #rare_idxs == []:
        return Z, []
    assert rare_idxs == range(min(rare_idxs), max(rare_idxs) + 1)

    Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']]
    for row in Z[1:]:
        # the row label and the common haplotypes.
        new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])]
        # the merged (summed) rare haplotypes go to a single column
        new_row.append(min(2, sum(row[i + 1] for i in rare_idxs)))
        Znew.append(new_row)

    # return merged data along with the list of rare haplotypes.
    return Znew, [Z[0][i + 1] for i in rare_idxs]

def getZ(block, header):
    sample_haps = []
    sample_haps.extend((get_phases(block, i) for i in range(9, len(header))))
    c = Counter()
    for row in sample_haps:
        c.update(row)
    # TODO: handle haplotypes containing 'X' (genotype of NA within hap)

    Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]]
    for i, row in enumerate(sample_haps):
        # e.g. if it has same hap from both parents then
        # it will have a score of Z for that haplotype
        Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]])
    return Z

def tryint(x):
    try: return int(x)
    except ValueError: return -1

def get_locs(block):
    return ["%s:%i" % (b[0], int(b[1])) for b in block]

def get_phases(block, idx, sep = re.compile("\||/")):
    """
    use the 0|1 in the genotype column to look up the ref/alt bases
    """
    refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block]
    genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block]
    genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block]
    return "_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),\
            "_".join([refalts[i][gp] for i, gp in enumerate(genosP)])

def is_phased(toks):
    return any("|" in t.split(":")[0] for t in toks[9:])

def n_missing(toks):
    return sum(t in (".", "./.") for t in toks[9:])

def read_blocks(vcf, min_block_size, max_missing_samples):
    """
    A haplotype block always contain "/", then the following lines in the
    same block all contain "|". A line with "/" indicates the start of a
    new block. This function takes a VCF and yields blocks
    """

    fh, header = ts.reader(vcf, header=False), [None]
    while header[0] != "#CHROM":
        header = next(fh)
    header[0] = "CHROM"
    yield header

    last_un = None
    for phased, block in it.groupby(ts.reader(fh, header=False), is_phased):
        if not phased:
            last_un = list(block)[-1]
            continue
        assert last_un
        block = [toks for toks in it.chain([last_un], block) if n_missing(toks) <=
                max_missing_samples]
        # subtract 1 for row name.
        if len(block) - 1 < min_block_size: continue
        last_un = None
        yield block

def send_array(arr, fh=open('/tmp/t.bin', 'w')):
    fh.seek(0)
    # send shape as int
    array.array('l', [len(arr), len(arr[0])]).tofile(fh)
    # send the data as double
    array.array('d', [i  for row in arr for i in row]).tofile(fh)
    fh.flush()

if __name__ == "__main__":
    hapskat()

## hapskat.R
library(SKAT)
set.seed(1)

hap.skat.null = function(formula, covs, ...){
    covs = read.delim(covs)
    assign("y", covs[[as.character(formula[[2]])]], envir = .GlobalEnv)
    m = SKAT_Null_Model(formula, data=covs, out_type="D")
    return(m)
}

hap.skat = function(Z, null.obj, y, wgts=NULL, ...){

    r = try(SKAT(Z, null.obj, weights=wgts, is_dosage=TRUE,
         kernel=ifelse(is.null(wgts), "linear", "linear.weighted"), method="liu"))
    if(!inherits(r, "try-error")){
        res = list(p_value=r$p.value, rho=r$param$rho_est,
                   n_tested=r$param$n.marker.test,
                   p_value_resampling=r$p.value.resampling)
    } else {
        res = list(p_value=NaN, rho=NaN, n_tested=NaN, p_value_resampling=NaN)
    }
    if("X1" %in% names(null.obj)){
        res$f_p_value = ftest(null.obj$X1, Z, y)
    } else {
        res$f_p_value = ftest(null.obj$re1$X1, Z, y)
    }
    res
}


ftest = function(X1, Z, y){
    # TODO: convert into formula so we can get f.test
    X1 = data.frame(X1)
    colnames(Z) = paste0("Z", 1:ncol(Z))
    colnames(X1)[1] = "intercept"
    xnames = colnames(X1)
    X1$yyy = y
    m0 = as.formula(sprintf("yyy ~ 0 + %s", paste(xnames, collapse=" + ")))
    fit0 = glm(m0, data=X1, family=binomial(link=logit))

    mz = as.formula(sprintf("yyy ~ 0 + %s + %s",
                     paste(xnames, collapse=" + "),
                     paste(colnames(Z), collapse=" + ")))
    X2 = cbind(X1, Z)
    fitz = glm(mz, data=X2, family=binomial(link=logit))
    r = anova(fit0, fitz, test="Chisq")
    return(r[2, "Pr(>Chi)"])
}

read.bin = function(bin.file){
    conn = file(bin.file, 'rb')
    mdims = readBin(conn, what=integer(), size=8, n=2)
    nrow = mdims[1]
    ncol = mdims[2]
    dat = readBin(conn, what=numeric(), size=8, n=nrow * ncol)
    close(conn)
    matrix(dat, nrow=nrow, ncol=ncol, byrow=TRUE)
}
	from __future__ import print_function
	from collections import Counter
	import toolshed as ts
	import itertools as it
	import click
	import sys
	import re
	import array
	from math import log10
	from pyper import R as Rpyper

	R = Rpyper(use_numpy=False)
	R.use_dict = True
	R('source("src/hapskat.R")')

	@click.command()
	@click.argument('vcf', type=click.Path(exists=True))
	@click.argument('cov', type=click.Path(exists=True))
	@click.argument('formula')
	@click.option('--min-snps', default=4, help="don't test a block unless it"
	" has at least this many variants.")
	@click.option('--max-missing', default=10, help="exclude variants with more"
	" missing variants than this.")
	def hapskat(vcf, cov, formula, min_snps, max_missing):

	fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}"
	block_iter = read_blocks(vcf, min_snps, max_missing)
	header = next(block_iter)
	samples = header[9:]
	print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)),
	file=sys.stderr)
	R('wgts <- NULL')
	print("#" + fmt.replace("}", "").replace("{", ""))
	for i, block in enumerate(block_iter):
	Z, rare_haps = merge_rare(getZ(block, header))
	assert [r[0] for r in Z[1:]] == samples, ("bad order")
	# remove column and row labels
	Zsub = [row[1:] for row in Z[1:]]
	send_array(Zsub)
	#R['wgts'] = wgts
	r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n")
	r = [x.strip() for x in r if x.strip() and not x.startswith('try')]
	if r:
	print("\n".join(r), file=sys.stderr)
	res = R['res']
	locs = get_locs(block)
	if len(locs) != len(set(locs)):
	print(len(locs), len(set(locs)), file=sys.stderr)
	print(res, file=sys.stderr)
	res['chrom'] = locs[0].split(":")[0]
	res['start'] = locs[0].split(":")[1]
	res['end'] = locs[-1].split(":")[1]
	res['haplotypes'] = "\|".join(Z[0][i] for i in range(1, len(Z[0])) if
	not Z[0][i].startswith("rare"))
	res['rare_haplotypes'] = "\|".join(rare_haps) or "NA"
	print(fmt.format(**res))

	print(i + 1, "tests", file=sys.stderr)

	def merge_rare(Z):
	# Z is row and column headers is of shape n_samples * n_haplotypes.

	# use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare.
	rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5
	col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1) for i in range(1, len(Z[0]))]
	rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff]
	# since we know that Z has the most common columns first, we can simplify
	# the stuff below to just use the first rare column
	# just check that we don't get, e.g. 2,4,5 instaed of 2,3,4,5
	if True: #rare_idxs == []:
	return Z, []
	assert rare_idxs == range(min(rare_idxs), max(rare_idxs) + 1)

	Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']]
	for row in Z[1:]:
	# the row label and the common haplotypes.
	new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])]
	# the merged (summed) rare haplotypes go to a single column
	new_row.append(min(2, sum(row[i + 1] for i in rare_idxs)))
	Znew.append(new_row)

	# return merged data along with the list of rare haplotypes.
	return Znew, [Z[0][i + 1] for i in rare_idxs]

	def getZ(block, header):
	sample_haps = []
	sample_haps.extend((get_phases(block, i) for i in range(9, len(header))))
	c = Counter()
	for row in sample_haps:
	c.update(row)
	# TODO: handle haplotypes containing 'X' (genotype of NA within hap)

	Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]]
	for i, row in enumerate(sample_haps):
	# e.g. if it has same hap from both parents then
	# it will have a score of Z for that haplotype
	Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]])
	return Z

	def tryint(x):
	try: return int(x)
	except ValueError: return -1

	def get_locs(block):
	return ["%s:%i" % (b[0], int(b[1])) for b in block]

	def get_phases(block, idx, sep = re.compile("\\|\|/")):
	"""
	use the 0\|1 in the genotype column to look up the ref/alt bases
	"""
	refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block]
	genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block]
	genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block]
	return "_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),\
	"_".join([refalts[i][gp] for i, gp in enumerate(genosP)])

	def is_phased(toks):
	return any("\|" in t.split(":")[0] for t in toks[9:])

	def n_missing(toks):
	return sum(t in (".", "./.") for t in toks[9:])

	def read_blocks(vcf, min_block_size, max_missing_samples):
	"""
	A haplotype block always contain "/", then the following lines in the
	same block all contain "\|". A line with "/" indicates the start of a
	new block. This function takes a VCF and yields blocks
	"""

	fh, header = ts.reader(vcf, header=False), [None]
	while header[0] != "#CHROM":
	header = next(fh)
	header[0] = "CHROM"
	yield header

	last_un = None
	for phased, block in it.groupby(ts.reader(fh, header=False), is_phased):
	if not phased:
	last_un = list(block)[-1]
	continue
	assert last_un
	block = [toks for toks in it.chain([last_un], block) if n_missing(toks) <=
	max_missing_samples]
	# subtract 1 for row name.
	if len(block) - 1 < min_block_size: continue
	last_un = None
	yield block

	def send_array(arr, fh=open('/tmp/t.bin', 'w')):
	fh.seek(0)
	# send shape as int
	array.array('l', [len(arr), len(arr[0])]).tofile(fh)
	# send the data as double
	array.array('d', [i for row in arr for i in row]).tofile(fh)
	fh.flush()

	if __name__ == "__main__":
	hapskat()
	library(SKAT)
	set.seed(1)

	hap.skat.null = function(formula, covs, ...){
	covs = read.delim(covs)
	assign("y", covs[[as.character(formula[[2]])]], envir = .GlobalEnv)
	m = SKAT_Null_Model(formula, data=covs, out_type="D")
	return(m)
	}

	hap.skat = function(Z, null.obj, y, wgts=NULL, ...){

	r = try(SKAT(Z, null.obj, weights=wgts, is_dosage=TRUE,
	kernel=ifelse(is.null(wgts), "linear", "linear.weighted"), method="liu"))
	if(!inherits(r, "try-error")){
	res = list(p_value=r$p.value, rho=r$param$rho_est,
	n_tested=r$param$n.marker.test,
	p_value_resampling=r$p.value.resampling)
	} else {
	res = list(p_value=NaN, rho=NaN, n_tested=NaN, p_value_resampling=NaN)
	}
	if("X1" %in% names(null.obj)){
	res$f_p_value = ftest(null.obj$X1, Z, y)
	} else {
	res$f_p_value = ftest(null.obj$re1$X1, Z, y)
	}
	res
	}


	ftest = function(X1, Z, y){
	# TODO: convert into formula so we can get f.test
	X1 = data.frame(X1)
	colnames(Z) = paste0("Z", 1:ncol(Z))
	colnames(X1)[1] = "intercept"
	xnames = colnames(X1)
	X1$yyy = y
	m0 = as.formula(sprintf("yyy ~ 0 + %s", paste(xnames, collapse=" + ")))
	fit0 = glm(m0, data=X1, family=binomial(link=logit))

	mz = as.formula(sprintf("yyy ~ 0 + %s + %s",
	paste(xnames, collapse=" + "),
	paste(colnames(Z), collapse=" + ")))
	X2 = cbind(X1, Z)
	fitz = glm(mz, data=X2, family=binomial(link=logit))
	r = anova(fit0, fitz, test="Chisq")
	return(r[2, "Pr(>Chi)"])
	}

	read.bin = function(bin.file){
	conn = file(bin.file, 'rb')
	mdims = readBin(conn, what=integer(), size=8, n=2)
	nrow = mdims[1]
	ncol = mdims[2]
	dat = readBin(conn, what=numeric(), size=8, n=nrow * ncol)
	close(conn)
	matrix(dat, nrow=nrow, ncol=ncol, byrow=TRUE)
	}