Last active
September 4, 2023 17:19
-
-
Save apainintheneck/64156db2fd8b6d1f9d7b2e8059282755 to your computer and use it in GitHub Desktop.
rsample - reservoir sampling script in awk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env awk -f
# Slight variation of this script: https://stackoverflow.com/a/48869665
# Validate the sample size passed as `-v k=N` and seed the random number
# generator before any input is read.
# k must be a positive decimal integer: no sign, no leading zero, no spaces.
BEGIN {
    if (k !~ /^[1-9][0-9]*$/) {
        help();
        # An unset/empty k is treated as a plain request for help (status 0).
        # A k that was supplied but is not a positive integer is a usage
        # error, so report failure to the caller instead of success.
        # exit inside BEGIN skips all input; END still runs, but the
        # reservoir is empty at that point so nothing further is printed.
        exit (k == "" ? 0 : 1);
    }
    # No argument: awk seeds from the time of day.
    srand();
}
# Per-record reservoir update (Algorithm R), executed once for every input line.
{
    # Fill phase: the first k records go straight into slots 0 .. k-1.
    if (NR <= k) {
        reservoir[NR - 1] = $0;
        next;
    }

    # Replacement phase: draw a slot uniformly from 0 .. NR-1. The current
    # record overwrites a stored one only when the draw lands inside the
    # reservoir, i.e. with probability k/NR.
    slot = int(NR * rand());
    if (slot < k) {
        reservoir[slot] = $0;
    }
}
# Emit the sampled lines once all input has been consumed.
END {
    # Slots are filled contiguously starting at 0, so walk them by index
    # until the first missing one. Unlike `for (key in reservoir)`, whose
    # traversal order is implementation-defined, this gives the same slot
    # order on every awk, and keeps input order when the input has fewer
    # than k lines. An empty reservoir (no input, or help was shown)
    # prints nothing.
    for (slot = 0; slot in reservoir; slot++) {
        print reservoir[slot];
    }
}
# Print the usage/help text to standard output.
# Takes no arguments and returns nothing.
function help() {
    # Title.
    print "rsample - reservoir sampling";
    printf("\n");

    # Invocation synopsis.
    print "Usage: rsample -v k=[INTEGER] [FILE]";
    printf("\n");

    # Short description of what the script does.
    print "Description:";
    print "Use reservoir sampling to collect k lines from a pipe, stdin or";
    print "a file in O(N) time with N being the total number of lines.";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment