Last active
January 1, 2021 07:54
-
-
Save JohannesBuchner/442e09b7c77c7150a4885c715eb17e6b to your computer and use it in GitHub Desktop.
awk solutions for simple groupby in https://h2oai.github.io/db-benchmark/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# columns: id1,id2,id3,id4,id5,id6,v1,v2,v3 | |
f=G1_1e7_1e2_0_0.csv | |
awk="time mawk" | |
# groupby simple | |
$awk -F, 'NR>1 { a[$1] += $7 } END {for (i in a) print i, a[i]}' $f >/dev/null | |
$awk -F, 'NR>1 { a[$1,$2] += $7 } END { for (comb in a) { split(comb,sep,SUBSEP); print sep[1], sep[2], a[sep[1],sep[2]]; }}' $f >/dev/null | |
$awk -F, 'NR>1 { a[$3] += $7; n[$3]++; b[$3] += $9; } END {for (i in a) print i, a[i], b[i]/n[i];}' $f >/dev/null | |
$awk -F, 'NR>1 { a[$4] += $7; n[$4]++; b[$4] += $8; } END {for (i in a) print i, a[i]/n[i], b[i]/n[i];}' $f >/dev/null | |
$awk -F, 'NR>1 { a[$6] += $7; b[$6] += $8; c[$6] += $9; } END {for (i in a) print i, a[i], b[i], c[i];}' $f >/dev/null | |
# groupby advanced | |
# Q1: median and std: | |
sort --parallel=10 -k9,9n $f | $awk -F, ' | |
NR>1 { | |
a[$4,$5,n[$4,$5]++]=$9; sum[$4,$5] += $9; sumsq[$4,$5] += ($9)^2; | |
} END { | |
for (comb in a) { | |
split(comb,sep,SUBSEP); | |
i=sep[1]; j=sep[2]; | |
s=sqrt((sumsq[i,j]-sum[i,j]^2/n[i,j])/n[i,j]); | |
if (n[i,j] % 2) | |
print i, j, a[i,j,int((n[i,j] + 1) / 2)], s; | |
else | |
print i, j, (a[i,j,int(n[i,j] / 2)]+a[i,j,int(n[i,j]/2 + 1)]) / 2, s; | |
} | |
}' | |
# Q2: find min & max: | |
$awk -F, 'NR>1 { if (min[$3]=="" || $8+0<min[$3]) min[$3] = $8+0; if (max[$3]="" || $7+0>max[$3]) max[$3] = $7+0; } END {for (i in min) print i, max[i] - min[i];}' $f #>/dev/null | |
# Q3: find two largest: | |
$awk -F, 'NR>1 { if (max[$6] == "" || $9+0>max[$6]) { max2[$6] = max[$6]; max[$6] = $9+0; }; } END {for (i in max) print i, max[i], max2[i];}' $f #>/dev/null | |
# Q4: correlation | |
$awk -F, ' | |
NR>1 { | |
a[$2,$4,n[$2,$4]++]=$9 | |
sum1[$2,$4] += $7 | |
sum2[$2,$4] += $8 | |
sumsq1[$2,$4] += ($7)^2; | |
sumsq2[$2,$4] += ($8)^2; | |
c[$2,$4] += ($7 * $8); | |
} END { | |
for (comb in a) { | |
split(comb,sep,SUBSEP); | |
i=sep[1]; j=sep[2]; | |
s1=sqrt((sumsq1[i,j]-sum1[i,j]^2/n[i,j])/n[i,j]); | |
s2=sqrt((sumsq2[i,j]-sum2[i,j]^2/n[i,j])/n[i,j]); | |
corr=(c[i,j]/n[i,j] + sum1[i,j]*sum2[i,j]/n[i,j]^2)/s1/s2; | |
print i, j, corr; | |
} | |
}' $f #>/dev/null | |
# Q5: sum v3 by id fields | |
$awk -F, 'NR>1 { a[$1,$2,$3,$4,$5,$6] += $9; n[$1,$2,$3,$4,$5,$6]++ } END { for (comb in a) { split(comb,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], a[comb], n[comb]; }}' $f # >/dev/null |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment