Skip to content

Instantly share code, notes, and snippets.

@JohannesBuchner
Last active January 1, 2021 07:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JohannesBuchner/442e09b7c77c7150a4885c715eb17e6b to your computer and use it in GitHub Desktop.
Save JohannesBuchner/442e09b7c77c7150a4885c715eb17e6b to your computer and use it in GitHub Desktop.
awk solutions for simple groupby in https://h2oai.github.io/db-benchmark/
# columns: id1,id2,id3,id4,id5,id6,v1,v2,v3
f=G1_1e7_1e2_0_0.csv
awk="time mawk"
# groupby simple
$awk -F, 'NR>1 { a[$1] += $7 } END {for (i in a) print i, a[i]}' $f >/dev/null
$awk -F, 'NR>1 { a[$1,$2] += $7 } END { for (comb in a) { split(comb,sep,SUBSEP); print sep[1], sep[2], a[sep[1],sep[2]]; }}' $f >/dev/null
$awk -F, 'NR>1 { a[$3] += $7; n[$3]++; b[$3] += $9; } END {for (i in a) print i, a[i], b[i]/n[i];}' $f >/dev/null
$awk -F, 'NR>1 { a[$4] += $7; n[$4]++; b[$4] += $8; } END {for (i in a) print i, a[i]/n[i], b[i]/n[i];}' $f >/dev/null
$awk -F, 'NR>1 { a[$6] += $7; b[$6] += $8; c[$6] += $9; } END {for (i in a) print i, a[i], b[i], c[i];}' $f >/dev/null
# groupby advanced
# Q1: median and std:
sort --parallel=10 -k9,9n $f | $awk -F, '
NR>1 {
a[$4,$5,n[$4,$5]++]=$9; sum[$4,$5] += $9; sumsq[$4,$5] += ($9)^2;
} END {
for (comb in a) {
split(comb,sep,SUBSEP);
i=sep[1]; j=sep[2];
s=sqrt((sumsq[i,j]-sum[i,j]^2/n[i,j])/n[i,j]);
if (n[i,j] % 2)
print i, j, a[i,j,int((n[i,j] + 1) / 2)], s;
else
print i, j, (a[i,j,int(n[i,j] / 2)]+a[i,j,int(n[i,j]/2 + 1)]) / 2, s;
}
}'
# Q2: find min & max:
$awk -F, 'NR>1 { if (min[$3]=="" || $8+0<min[$3]) min[$3] = $8+0; if (max[$3]="" || $7+0>max[$3]) max[$3] = $7+0; } END {for (i in min) print i, max[i] - min[i];}' $f #>/dev/null
# Q3: find two largest:
$awk -F, 'NR>1 { if (max[$6] == "" || $9+0>max[$6]) { max2[$6] = max[$6]; max[$6] = $9+0; }; } END {for (i in max) print i, max[i], max2[i];}' $f #>/dev/null
# Q4: correlation
$awk -F, '
NR>1 {
a[$2,$4,n[$2,$4]++]=$9
sum1[$2,$4] += $7
sum2[$2,$4] += $8
sumsq1[$2,$4] += ($7)^2;
sumsq2[$2,$4] += ($8)^2;
c[$2,$4] += ($7 * $8);
} END {
for (comb in a) {
split(comb,sep,SUBSEP);
i=sep[1]; j=sep[2];
s1=sqrt((sumsq1[i,j]-sum1[i,j]^2/n[i,j])/n[i,j]);
s2=sqrt((sumsq2[i,j]-sum2[i,j]^2/n[i,j])/n[i,j]);
corr=(c[i,j]/n[i,j] + sum1[i,j]*sum2[i,j]/n[i,j]^2)/s1/s2;
print i, j, corr;
}
}' $f #>/dev/null
# Q5: sum v3 by id fields
$awk -F, 'NR>1 { a[$1,$2,$3,$4,$5,$6] += $9; n[$1,$2,$3,$4,$5,$6]++ } END { for (comb in a) { split(comb,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], a[comb], n[comb]; }}' $f # >/dev/null
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment