Created
March 8, 2016 16:24
-
-
Save loreb/da844f393bbb132c048c to your computer and use it in GitHub Desktop.
GNU shuf in go, out of boredom during a stupid accident
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A reimplementation of GNU shuf (think *BSD) | |
// -- there was a shuffle(1) ages ago, but it got deleted. | |
// GNU shuf goes out of its way to optimize anything it can | |
// -- just see its treatment of --random-source! | |
// this is *much* more simple minded - TLDR: use GNU shuf. | |
// Seriously, its inspiration comes from: | |
// 1. shuf on *BSD without installing coreutils; | |
// 2. my router abandoning me for >30 minutes. | |
package main | |
import ( | |
"bufio" | |
"crypto/aes" | |
"crypto/cipher" | |
urandom "crypto/rand" | |
"encoding/binary" | |
"flag" | |
"fmt" | |
"io" | |
"math/rand" | |
"os" | |
) | |
// original usage (for reference) | |
const usage = ` | |
Usage: shuf [OPTION]... [FILE] | |
or: shuf -e [OPTION]... [ARG]... | |
or: shuf -i LO-HI [OPTION]... | |
Write a random permutation of the input lines to standard output. | |
With no FILE, or when FILE is -, read standard input. | |
Mandatory arguments to long options are mandatory for short options too. | |
-e, --echo treat each ARG as an input line | |
-i, --input-range=LO-HI treat each number LO through HI as an input line | |
-n, --head-count=COUNT output at most COUNT lines | |
-o, --output=FILE write result to FILE instead of standard output | |
--random-source=FILE get random bytes from FILE | |
-r, --repeat output lines can be repeated | |
-z, --zero-terminated line delimiter is NUL, not newline | |
--help display this help and exit | |
--version output version information and exit | |
GNU coreutils online help: <http://www.gnu.org/software/coreutils/> | |
Full documentation at: <http://www.gnu.org/software/coreutils/shuf> | |
or available locally via: info '(coreutils) shuf invocation' | |
` | |
const version = "Go reimplementation of GNU shuf 8.25" | |
func main() { | |
eflag := flag.Bool("e", false, "treat each ARG as an input line") | |
iflag := flag.String("i", "", "treat each number LO through HI as an input line") | |
nflag := flag.Int("n", -1, "output at most COUNT lines") | |
oflag := flag.String("o", "", "write result to FILE instead of standard output") | |
random_source := flag.String("random-source", "", "get random bytes from FILE (for tests etc)") | |
rflag := flag.Bool("r", false, "output lines can be repeated") | |
zflag := flag.Bool("z", false, "line delimiter is NUL, not newline") | |
vflag := flag.Bool("version", false, "output version information and exit") | |
// long options aliases; tnx rob pike for this trick. | |
flag.BoolVar(eflag, "echo", false, "alias for -e") | |
flag.StringVar(iflag, "input-range", *iflag, "alias for -i") | |
flag.IntVar(nflag, "head-count", *nflag, "alias for -n") | |
flag.StringVar(oflag, "output", "", "alias for -o") | |
flag.BoolVar(rflag, "repeat", *rflag, "alias for -r") | |
flag.BoolVar(zflag, "zero-terminated", *zflag, "alias for -z") | |
// TODO "--help" prints usage to STDERR and exits NONZERO... | |
flag.Parse() | |
if *vflag { | |
println(version) | |
return | |
} | |
argv := flag.Args() | |
var rng *rand.Rand | |
if len(*random_source) > 0 { | |
rng = rand.New(NewFileRng(*random_source)) | |
} else { | |
rng = rand.New(NewRandom()) | |
} | |
delim := byte('\n') | |
if *zflag { | |
delim = byte(0) | |
} | |
// fmt.Print*() => a write(2) every time! | |
stdout := bufio.NewWriter(os.Stdout) | |
if len(*oflag) != 0 { | |
// os.Stdout is its own filetype for historical reasons; | |
// it really should be just an io.Writer | |
f, err := os.OpenFile(*oflag, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) | |
if err != nil { | |
panic(err) | |
} | |
stdout = bufio.NewWriter(f) | |
} | |
if *eflag && len(*iflag) != 0 { | |
panic("can't use -e and -i") | |
} | |
var lines []string | |
if *eflag { | |
lines = argv | |
if len(lines) == 0 { | |
panic("-e ARGS") | |
} | |
} else if len(*iflag) != 0 { | |
if len(argv) != 0 { | |
panic("-i a-b; no args") | |
} | |
a, b := parserange(*iflag) | |
lines = rangeinclusive(a, b) | |
} else if len(argv) > 0 { | |
if len(argv) != 1 { | |
panic("only one file argument") | |
} | |
f, err := os.Open(argv[0]) | |
if err != nil { | |
panic(err) | |
} | |
lines = slurplines(f, delim) | |
} else { | |
lines = slurplines(os.Stdin, delim) | |
} | |
// finally! | |
if len(lines) == 0 { | |
panic("shuf what?") | |
} | |
for len(lines) > 0 { | |
if *nflag >= 0 { | |
*nflag-- | |
if *nflag < 0 { | |
break | |
} | |
} | |
i := rng.Intn(len(lines)) | |
lines[0], lines[i] = lines[i], lines[0] | |
// 3x faster than fmt.Fprintf()! | |
_, err := stdout.WriteString(lines[0]) | |
if err != nil { | |
panic(err) | |
} | |
err = stdout.WriteByte(delim) | |
if err != nil { | |
panic(err) | |
} | |
if !*rflag { | |
lines = lines[1:] | |
} | |
} | |
err := stdout.Flush() | |
if err != nil { | |
panic(err) | |
} | |
} | |
func slurplines(r io.Reader, delim byte) []string { | |
b := bufio.NewReader(r) | |
var lines []string | |
for { | |
bytes, err := b.ReadBytes(delim) | |
if err != nil { | |
if err == io.EOF { | |
break | |
} | |
panic(err) | |
} | |
// chomp() | |
if bytes[len(bytes)-1] == delim { | |
bytes = bytes[:len(bytes)-1] | |
} | |
lines = append(lines, string(bytes)) | |
} | |
return lines | |
} | |
// parse "1-23" | |
func parserange(s string) (uint64, uint64) { | |
var a, b uint64 | |
var separator byte | |
n, err := fmt.Sscanf(s, "%v%c%v", &a, &separator, &b) | |
if err != nil { | |
// "1" => EOF, the most intuitive diagnostic of the year. | |
panic(fmt.Sprintf("Sscanf(%v) => %v", s, err)) | |
} | |
switch separator { | |
case '-': | |
break | |
case ':': | |
break | |
default: | |
panic("bad separator") | |
} | |
if n != 3 { | |
panic("sscanf") | |
} | |
return a, b | |
} | |
func rangeinclusive(a, b uint64) []string { | |
var str []string | |
for { | |
// string(int) thinks it's a unicode codepoint... | |
str = append(str, fmt.Sprintf("%v", a)) | |
if a == b { | |
break | |
} | |
a++ | |
} | |
return str | |
} | |
// default source: aes/ctr seeded from crypto/rand | |
// math/rand results in shuf being ~20% faster than aes256. | |
// aes 192/256 are equally fast. | |
// UPDATE: after getting rid of Fprintf(), math/rand is 2x faster! | |
// (and almost as fast as GNU/shuf; tested with "ls|shuf -r") | |
// The obvious speedup is to just use a few bytes (we pull 64 bits each time) | |
type rng struct { | |
stream cipher.Stream | |
} | |
func (r *rng) Seed(seed int64) { _ = seed } | |
func (r *rng) Int63() int64 { | |
b := make([]byte, 8) | |
r.stream.XORKeyStream(b, b) | |
return int64(binary.LittleEndian.Uint64(b) >> 1) | |
} | |
func NewRandom() rand.Source { | |
key := make([]byte, 256/8) | |
n, err := urandom.Read(key) | |
if err != nil { | |
panic(err) // maybe use a math/rand... | |
} | |
if n != len(key) { | |
panic("rand.Read borked") | |
} | |
b, err := aes.NewCipher(key) | |
if err != nil { | |
panic(err) | |
} | |
r := new(rng) | |
r.stream = cipher.NewCTR(b, make([]byte, 128/8)) | |
return r | |
} | |
// file source: read bytes from file | |
type filerng struct { | |
name string | |
r io.Reader | |
} | |
func (f *filerng) Seed(seed int64) { _ = seed } | |
func (f *filerng) Int63() int64 { | |
b := make([]byte, 8) | |
n, err := f.r.Read(b) | |
if err != nil { | |
panic(err) | |
} | |
if n != len(b) { | |
panic("short read from " + f.name) | |
} | |
return int64(binary.LittleEndian.Uint64(b) >> 1) | |
} | |
func NewFileRng(filename string) rand.Source { | |
f, err := os.Open(filename) | |
if err != nil { | |
panic(err) | |
} | |
return &filerng{r: bufio.NewReader(f), name: filename} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment