brentp/af_lim.py

## af_lim.py
from math import sqrt


def jeffreys_interval(af, n_samples, alpha=0.05):
    # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
    """
    >>> jeffreys_interval(0.2, 5) #doctest: +ELLIPSIS
    (0.02251..., 0.6286...)
    >>> jeffreys_interval(0.2, 10) #doctest: +ELLIPSIS
    (0.0440..., 0.5027...)
    >>> jeffreys_interval(0.2, 100) #doctest: +ELLIPSIS
    (0.1307..., 0.2862...)
    >>> jeffreys_interval(0.002, 1000) #doctest: +ELLIPSIS
    (0.000415..., 0.00640...)
    """
    from scipy.stats import beta
    n_success = af * n_samples

    l = beta.ppf(alpha / 2, a=n_success + 0.5, b=n_samples - n_success + 0.5)
    u = beta.ppf(1 - alpha / 2, a=n_success + 0.5, b=n_samples - n_success + 0.5)
    return l, u


def af_lims(af, n_samples, alpha=0.05):
    """
    calculate the confiedence limits (alpha, 1 - alpha) of allele frequency
    (af) from n_samples
    Uses eqn 4 from:
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.408.7107&rep=rep1&type=pdf

    AF of 0.2 with varying samples
    >>> af_lims(0.2, 5) #doctest: +ELLIPSIS
    (0.01052..., 0.70121...)

    >>> af_lims(0.2, 10) #doctest: +ELLIPSIS
    (0.0354..., 0.5578...)

    >>> af_lims(0.2, 100) #doctest: +ELLIPSIS
    (0.1292..., 0.2943...)

    >>> af_lims(0.2, 1000) #doctest: +ELLIPSIS
    (0.17590..., 0.2264...)

    AF of 0.002 with varying samples
    >>> af_lims(0.002, 5) #doctest: +ELLIPSIS
    (0.01775..., 0.5389...)

    >>> af_lims(0.002, 10) #doctest: +ELLIPSIS
    (0.008328..., 0.3470...)

    >>> af_lims(0.002, 100) #doctest: +ELLIPSIS
    (0.000280154..., 0.049524...)

    >>> af_lims(0.002, 1000) #doctest: +ELLIPSIS
    (0.00034..., 0.00803...)
    """

    alpha_to_z = {
        0.1: 1.64485362695,
        0.05: 1.96,
        0.01: 2.57582930355,
        0.001: 3.29052673149,
    }

    p, n = af, n_samples
    z = alpha_to_z[alpha]
    q = 1 - p

    a = 2*n*p + z**2
    denom = 2 * (n + z**2)

    try:
        bL = z * sqrt(z**2 - 2 - 1.0 / n + 4*p*(n*q+1))
        L = (a - 1 - bL) / denom
    except ValueError:
        L = 0.0

    try:
        bU = z * sqrt(z**2 + 2 - 1.0 / n + 4*p*(n*q-1))
        U = (a + 1 + bU) / denom
    except ValueError:
        U = 1.0
    return max(L, 0.0), min(U, 1.0)


if __name__ == "__main__":
    import doctest
    doctest.testmod()

    import sys
    if len(sys.argv) > 1:

        n_exac_samples = 60706
        import sqlite3
        db = sqlite3.connect(sys.argv[1])
        db.row_factory = sqlite3.Row
        cursor = db.cursor()
        cols = ["aaf_adj_exac_afr", "aaf_adj_exac_amr", "aaf_adj_exac_eas", "aaf_adj_exac_nfe", "aaf_adj_exac_sas"]
        try:
            cursor.execute("ALTER table variants ADD COLUMN exac_af_lower_ci_max REAL")
            print "adding 'exac_af_lower_ci_max' column"
        except sqlite3.OperationalError:
            print "modifying existing 'exac_af_lower_ci_max' column"

        maxs = []
        for afs in cursor.execute("select %s from variants" % ", ".join(cols)):
            # [0] pulls the lower CI.
            cis = [af_lims(af, n_exac_samples)[0] for af in afs if af is not None and -1 < af]
            if len(cis) == 0:
                maxs.append(-1)
            else:
                maxs.append(max(cis))

        print "updating database with %d values" % len(maxs)
        cursor.execute("BEGIN TRANSACTION")
        for i, m in enumerate(maxs, start=1):
            cursor.execute("UPDATE variants SET exac_af_lower_ci_max = %f WHERE variant_id = %d" % (m, i))
        db.commit()
        cursor.close()
	from math import sqrt


	def jeffreys_interval(af, n_samples, alpha=0.05):
	# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
	"""
	>>> jeffreys_interval(0.2, 5) #doctest: +ELLIPSIS
	(0.02251..., 0.6286...)
	>>> jeffreys_interval(0.2, 10) #doctest: +ELLIPSIS
	(0.0440..., 0.5027...)
	>>> jeffreys_interval(0.2, 100) #doctest: +ELLIPSIS
	(0.1307..., 0.2862...)
	>>> jeffreys_interval(0.002, 1000) #doctest: +ELLIPSIS
	(0.000415..., 0.00640...)
	"""
	from scipy.stats import beta
	n_success = af * n_samples

	l = beta.ppf(alpha / 2, a=n_success + 0.5, b=n_samples - n_success + 0.5)
	u = beta.ppf(1 - alpha / 2, a=n_success + 0.5, b=n_samples - n_success + 0.5)
	return l, u



	def af_lims(af, n_samples, alpha=0.05):
	"""
	calculate the confiedence limits (alpha, 1 - alpha) of allele frequency
	(af) from n_samples
	Uses eqn 4 from:
	http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.408.7107&rep=rep1&type=pdf

	AF of 0.2 with varying samples
	>>> af_lims(0.2, 5) #doctest: +ELLIPSIS
	(0.01052..., 0.70121...)

	>>> af_lims(0.2, 10) #doctest: +ELLIPSIS
	(0.0354..., 0.5578...)

	>>> af_lims(0.2, 100) #doctest: +ELLIPSIS
	(0.1292..., 0.2943...)

	>>> af_lims(0.2, 1000) #doctest: +ELLIPSIS
	(0.17590..., 0.2264...)

	AF of 0.002 with varying samples
	>>> af_lims(0.002, 5) #doctest: +ELLIPSIS
	(0.01775..., 0.5389...)

	>>> af_lims(0.002, 10) #doctest: +ELLIPSIS
	(0.008328..., 0.3470...)

	>>> af_lims(0.002, 100) #doctest: +ELLIPSIS
	(0.000280154..., 0.049524...)

	>>> af_lims(0.002, 1000) #doctest: +ELLIPSIS
	(0.00034..., 0.00803...)
	"""

	alpha_to_z = {
	0.1: 1.64485362695,
	0.05: 1.96,
	0.01: 2.57582930355,
	0.001: 3.29052673149,
	}

	p, n = af, n_samples
	z = alpha_to_z[alpha]
	q = 1 - p

	a = 2np + z**2
	denom = 2 * (n + z**2)

	try:
	bL = z * sqrt(z*2 - 2 - 1.0 / n + 4p(nq+1))
	L = (a - 1 - bL) / denom
	except ValueError:
	L = 0.0

	try:
	bU = z * sqrt(z*2 + 2 - 1.0 / n + 4p(nq-1))
	U = (a + 1 + bU) / denom
	except ValueError:
	U = 1.0
	return max(L, 0.0), min(U, 1.0)



	if __name__ == "__main__":
	import doctest
	doctest.testmod()

	import sys
	if len(sys.argv) > 1:

	n_exac_samples = 60706
	import sqlite3
	db = sqlite3.connect(sys.argv[1])
	db.row_factory = sqlite3.Row
	cursor = db.cursor()
	cols = ["aaf_adj_exac_afr", "aaf_adj_exac_amr", "aaf_adj_exac_eas", "aaf_adj_exac_nfe", "aaf_adj_exac_sas"]
	try:
	cursor.execute("ALTER table variants ADD COLUMN exac_af_lower_ci_max REAL")
	print "adding 'exac_af_lower_ci_max' column"
	except sqlite3.OperationalError:
	print "modifying existing 'exac_af_lower_ci_max' column"

	maxs = []
	for afs in cursor.execute("select %s from variants" % ", ".join(cols)):
	# [0] pulls the lower CI.
	cis = [af_lims(af, n_exac_samples)[0] for af in afs if af is not None and -1 < af]
	if len(cis) == 0:
	maxs.append(-1)
	else:
	maxs.append(max(cis))

	print "updating database with %d values" % len(maxs)
	cursor.execute("BEGIN TRANSACTION")
	for i, m in enumerate(maxs, start=1):
	cursor.execute("UPDATE variants SET exac_af_lower_ci_max = %f WHERE variant_id = %d" % (m, i))
	db.commit()
	cursor.close()