Last active
February 20, 2018 19:11
-
-
Save josherrickson/55515ea17abca89d0a0859188060bfcf to your computer and use it in GitHub Desktop.
Awk script to compute the mean of a csv file's column.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
SEP="," | |
HEADER=true | |
STDERR=false | |
while [ "$#" -gt 0 ]; do | |
case "$1" in | |
-s) SEP="$2"; shift 2;; | |
-c) COL="$2"; shift 2;; | |
-e) STDERR=true; shift 1;; | |
--separator=*) SEP="${1#*=}"; shift 1;; | |
--column=*) COL="${1#*=}"; shift 1;; | |
--no-header) HEADER=false; shift 1;; | |
--standarderror) STDERR=true; shift 1;; | |
--separator|--column) echo "$1 requires an argument" >&2; exit 1;; | |
-*) echo "unknown option: $1" >&2; exit 1;; | |
*) data="$1"; shift 1;; | |
esac | |
done | |
mawk -v FS="$SEP" -v col="$COL" -v header="$HEADER" -v stderr="$STDERR" ' | |
BEGIN { | |
split(col, cols, " ") | |
OFS = "" | |
} | |
{ | |
for(i = 1; i <= length(cols); i++) { | |
colnum = cols[i] | |
if ($colnum != "") { | |
# Count the number of non-missing entries observed so far | |
n[i]++ | |
if (NR == 1) { | |
if (header == "true") { | |
colname[i] = $colnum | |
# If we have a header, start the count one later | |
n[i]-- | |
} else { | |
mean[i] = $colnum | |
} | |
} else if (NR == 2 && header == "true") { | |
mean[i] = $colnum | |
} else { | |
# delta and delta2 are working objects (note that 'mean' | |
# changes between defintions). | |
# mean and s2 are the rolling mean and variance respectively | |
delta = $colnum - mean[i] | |
mean[i] += delta/n[i] | |
delta2 = $colnum - mean[i] | |
s2[i] = s2[i]*(n[i]-2)/(n[i]-1) + delta*delta2/(n[i]-1) | |
} | |
} | |
} | |
} | |
END { | |
if (header == "true") { | |
print NR - 1 " rows observed." | |
} else { | |
print NR " rows observed." | |
} | |
# This loop is manually defined to enforce ordering | |
for(i = 1; i <= length(cols); i++) { | |
if (colname[i] == "") { | |
meanout = "Mean of column " cols[i] " = " mean[i] | |
} else { | |
meanout = "Mean of " colname[i] " = " mean[i] | |
} | |
if (stderr == "true") { | |
sigmaout = " (std err = " sqrt(s2[i]/n[i]) | |
} else { | |
sigmaout = " (std dev = " sqrt(s2[i]) | |
} | |
print meanout sigmaout ", " n[i] " observed records)" | |
} | |
} | |
' $data |
This requires "malk" to be installed (https://invisible-island.net/mawk/, on Mac: brew install mawk
). Change mawk
to awk
on line 24 if you'd prefer to use awk
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Set as an executable:
Run via:
That will compute the mean of the 2nd column of data.csv.
Arguments include:
-c #
or--column=#
, which column to compute the mean of. Can either be a single number (e.g.-c 3
or a space separated quoted list-c "3 6 2 4"
)-s " "
or--separator=" "
, what the separator is (default is-s ","
for CSV).--no-header
, if the data does not contain a header row.-e
or--standarderror
, returns standard error instead of standard deviation.Computed via rolling mean so that very large data will not overrun the limits (e.g. not just computing sum and dividing by n).