Skip to content

Instantly share code, notes, and snippets.

@dbro
Created April 24, 2014 07:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dbro/11245114 to your computer and use it in GitHub Desktop.
Save dbro/11245114 to your computer and use it in GitHub Desktop.
correlation
#!/usr/bin/awk -f
# input should be parallel sets of numbers, one set on each line, tab-separated.
# the input does not need to be sorted
# non-numeric input anywhere on the line will cause the entire line to be ignored
# this uses a naive algorithm that may lose precision in some situations.
# see http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance for an alternate algorithm
BEGIN {
FS="\t";
OFS=FS;
n=0; # count of rows (which contain a full set of data)
field_count=-1; # invalid, will be initialized by first row
split("", sums); # initialize array. not necessary
rejectctr=0;
exitcode=0; # OK
}
{
if (field_count == -1) {
field_count = NF; # determined by the first row
if (field_count < 1) {
print "ERROR. first row has fewer than 1 column (they should be tab delimited)" > "/dev/stderr";
exitcode = 1;
exit exitcode;
}
}
# first check all the data in the row to see if this row should be rejected
reject_this_row = 0; # false
if (NF != field_count) {
reject_this_row = 1; # true
} else {
for (i=1; i<=field_count; i++) {
v = $i + 0;
if (v != $i) { # test for numeric
reject_this_row = 1; # true
}
}
}
if (reject_this_row == 1) {
rejectedctr++;
} else {
# then update the accumulators
n += 1;
for (i=1; i<=field_count; i++) {
$i;
sums[i] += $i; # single-column accumulator
# pairwise accumulators
for (j=i; j<=field_count; j++) {
sums[i,j] += ($i * $j); # i is always less than or equal to j
}
}
}
}
END {
if (exitcode == 0) {
printf("col1\tcol2\tcorr\n");
for (i=1; i<=field_count; i++) {
var_i = ((sums[i,i] - ((sums[i] * sums[i]) / n)) / n);
std_dev_i = sqrt(var_i);
#print "column " i " variance = " var_i "\tstd_dev = " std_dev_i;
for (j=i+1; j<=field_count; j++) {
var_j = ((sums[j,j] - ((sums[j] * sums[j]) / n)) / n);
std_dev_j = sqrt(var_j);
cov = ((sums[i,j] - ((sums[i] * sums[j]) / n)) / n);
corr = cov / (std_dev_i * std_dev_j);
printf("%d\t%d\t%.3f\n", i, j, corr);
}
}
print n " rows included, and " rejectctr " rows rejected" > "/dev/stderr";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment