dustalov/ztest.awk

## ztest.awk
#!/usr/bin/awk -f
# Set up the constant PI and fill in defaults for the tunables ALPHA, SE and
# TAILS when they have not been preset (e.g. with awk's -v option).
BEGIN {
    # atan2(0, -1) is the angle of the negative x-axis, i.e. pi.
    PI = atan2(0, -1);

    # Number of tails: any value other than 2 falls back to 1.
    if (!(TAILS == 2)) {
        TAILS = 1;
    }

    # Standard error estimation method, "basic" or "pooled";
    # an empty value selects "basic".
    if (!length(SE)) {
        SE = "basic";
    }

    # Significance level; an empty value selects 0.05.
    if (!length(ALPHA)) {
        ALPHA = 0.05;
    }
}
# Runs for every input record: remember its first three fields, indexed by
# the record number, so the END rule can compare all rows pairwise.
{
    ns[NR] = $3;  # third field: sample size
    xs[NR] = $2;  # second field: proportion
    ls[NR] = $1;  # first field: label
}
# Pairwise two-proportion z-test over all stored rows.
#
# For each pair (i, j) with i < j, prints: both labels, the proportion and
# sample size of each row, |z|, the p-value with six decimals, and 1 or 0
# depending on whether the p-value is below ALPHA.
#
# Exit status: 1 for an unknown SE mode, 2 if a p-value above 1 is produced,
# 3 if the standard error cannot be computed for a pair.
END {
    for (i = 1; i < NR; i++) {
        for (j = i + 1; j <= NR; j++) {
            if (SE != "basic" && SE != "pooled") {
                print "Unknown SE mode." > "/dev/stderr";
                exit 1;
            }

            # Every formula below divides by the sample sizes; fail with a
            # clear message instead of a bare division-by-zero abort.
            if (ns[i] + 0 <= 0 || ns[j] + 0 <= 0) {
                print "Sample sizes must be positive: " ls[i] ", " ls[j] "." > "/dev/stderr";
                exit 3;
            }

            # point estimate
            pe = xs[i] - xs[j];

            # standard error
            if (SE == "basic") {
                se = sqrt(xs[i] * (1 - xs[i]) / ns[i] + xs[j] * (1 - xs[j]) / ns[j]);
            } else {
                # pooled proportion
                pp = (xs[i] * ns[i] + xs[j] * ns[j]) / (ns[i] + ns[j]);
                se = sqrt(pp * (1 - pp) / ns[i] + pp * (1 - pp) / ns[j]);
            }

            # The standard error is zero when both variance terms vanish
            # (e.g. proportions of exactly 0 or 1); written as !(se > 0) so
            # that a NaN result is rejected as well.
            if (!(se > 0)) {
                print "Standard error is not positive: " ls[i] ", " ls[j] "." > "/dev/stderr";
                exit 3;
            }

            # Z-score; only its magnitude is used
            z = pe / se;
            if (z < 0) z = -z;

            # pnorm is the upper tail P(Z > z) of the standard normal
            # distribution, not P(Z <= z)
            if (z > 10) {
                # The tail is far smaller than what subtracting from 0.5
                # can resolve in floating point, so report it as zero and
                # skip the series altogether.
                pnorm = 0;
            } else {
                # sum = z + z^3/3 + z^5/(3*5) + z^7/(3*5*7) + ...
                value = z;
                sum   = z;
                for (k = 1; k <= 100; k++) {
                    value *= z * z / (2 * k + 1);
                    sum   += value;
                }

                # z * z instead of z**2: the ** operator is not part of
                # POSIX awk.
                pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-(z * z) / 2);

                # Rounding can push a tiny tail just below zero.
                if (pnorm < 0) pnorm = 0;
            }

            pvalue = TAILS * pnorm;

            if (pvalue > 1) {
                print "p-value is computed incorrectly." > "/dev/stderr";
                exit 2;
            }

            # The comparison is parenthesized so that it is unambiguous
            # inside the print list.
            print ls[i], ls[j], xs[i], ns[i], xs[j], ns[j], z, sprintf("%.6f", pvalue), (pvalue < ALPHA);
        }
    }
}
	#!/usr/bin/awk -f
	# Set up the constant PI and fill in defaults for the tunables ALPHA, SE
	# and TAILS when they have not been preset (e.g. with awk's -v option).
	BEGIN {
	    # atan2(0, -1) is the angle of the negative x-axis, i.e. pi.
	    PI = atan2(0, -1);

	    # Number of tails: any value other than 2 falls back to 1.
	    if (!(TAILS == 2)) {
	        TAILS = 1;
	    }

	    # Standard error estimation method, "basic" or "pooled";
	    # an empty value selects "basic".
	    if (!length(SE)) {
	        SE = "basic";
	    }

	    # Significance level; an empty value selects 0.05.
	    if (!length(ALPHA)) {
	        ALPHA = 0.05;
	    }
	}
	# Runs for every input record: remember its first three fields, indexed
	# by the record number, so the END rule can compare all rows pairwise.
	{
	    ns[NR] = $3;  # third field: sample size
	    xs[NR] = $2;  # second field: proportion
	    ls[NR] = $1;  # first field: label
	}
	# Pairwise two-proportion z-test over all stored rows.
	#
	# For each pair (i, j) with i < j, prints: both labels, the proportion and
	# sample size of each row, |z|, the p-value with six decimals, and 1 or 0
	# depending on whether the p-value is below ALPHA.
	#
	# Exit status: 1 for an unknown SE mode, 2 if a p-value above 1 is
	# produced, 3 if the standard error cannot be computed for a pair.
	END {
	    for (i = 1; i < NR; i++) {
	        for (j = i + 1; j <= NR; j++) {
	            if (SE != "basic" && SE != "pooled") {
	                print "Unknown SE mode." > "/dev/stderr";
	                exit 1;
	            }

	            # Every formula below divides by the sample sizes; fail with
	            # a clear message instead of a bare division-by-zero abort.
	            if (ns[i] + 0 <= 0 || ns[j] + 0 <= 0) {
	                print "Sample sizes must be positive: " ls[i] ", " ls[j] "." > "/dev/stderr";
	                exit 3;
	            }

	            # point estimate
	            pe = xs[i] - xs[j];

	            # standard error
	            if (SE == "basic") {
	                se = sqrt(xs[i] * (1 - xs[i]) / ns[i] + xs[j] * (1 - xs[j]) / ns[j]);
	            } else {
	                # pooled proportion
	                pp = (xs[i] * ns[i] + xs[j] * ns[j]) / (ns[i] + ns[j]);
	                se = sqrt(pp * (1 - pp) / ns[i] + pp * (1 - pp) / ns[j]);
	            }

	            # The standard error is zero when both variance terms vanish
	            # (e.g. proportions of exactly 0 or 1); written as !(se > 0)
	            # so that a NaN result is rejected as well.
	            if (!(se > 0)) {
	                print "Standard error is not positive: " ls[i] ", " ls[j] "." > "/dev/stderr";
	                exit 3;
	            }

	            # Z-score; only its magnitude is used
	            z = pe / se;
	            if (z < 0) z = -z;

	            # pnorm is the upper tail P(Z > z) of the standard normal
	            # distribution, not P(Z <= z)
	            if (z > 10) {
	                # The tail is far smaller than what subtracting from 0.5
	                # can resolve in floating point, so report it as zero and
	                # skip the series altogether.
	                pnorm = 0;
	            } else {
	                # sum = z + z^3/3 + z^5/(3*5) + z^7/(3*5*7) + ...
	                # Each term is the previous one times z*z / (2k + 1); the
	                # explicit "*=" and "*" matter, because "z z" would be
	                # string concatenation in awk.
	                value = z;
	                sum = z;
	                for (k = 1; k <= 100; k++) {
	                    value *= z * z / (2 * k + 1);
	                    sum += value;
	                }

	                # z * z instead of z**2: the ** operator is not part of
	                # POSIX awk.
	                pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-(z * z) / 2);

	                # Rounding can push a tiny tail just below zero.
	                if (pnorm < 0) pnorm = 0;
	            }

	            pvalue = TAILS * pnorm;

	            if (pvalue > 1) {
	                print "p-value is computed incorrectly." > "/dev/stderr";
	                exit 2;
	            }

	            # The comparison is parenthesized so that it is unambiguous
	            # inside the print list.
	            print ls[i], ls[j], xs[i], ns[i], xs[j], ns[j], z, sprintf("%.6f", pvalue), (pvalue < ALPHA);
	        }
	    }
	}