Skip to content

Instantly share code, notes, and snippets.

@loreb
Created March 8, 2016 16:24
Show Gist options
  • Save loreb/da844f393bbb132c048c to your computer and use it in GitHub Desktop.
Save loreb/da844f393bbb132c048c to your computer and use it in GitHub Desktop.
GNU shuf in go, out of boredom during a stupid accident
// A reimplementation of GNU shuf (think *BSD)
// -- there was a shuffle(1) ages ago, but it got deleted.
// GNU shuf goes out of its way to optimize anything it can
// -- just see its treatment of --random-source!
// this is *much* more simple minded - TLDR: use GNU shuf.
// Seriously, its inspiration comes from:
// 1. shuf on *BSD without installing coreutils;
// 2. my router abandoning me for >30 minutes.
package main
import (
"bufio"
"crypto/aes"
"crypto/cipher"
urandom "crypto/rand"
"encoding/binary"
"flag"
"fmt"
"io"
"math/rand"
"os"
)
// original usage (for reference)
const usage = `
Usage: shuf [OPTION]... [FILE]
or: shuf -e [OPTION]... [ARG]...
or: shuf -i LO-HI [OPTION]...
Write a random permutation of the input lines to standard output.
With no FILE, or when FILE is -, read standard input.
Mandatory arguments to long options are mandatory for short options too.
-e, --echo treat each ARG as an input line
-i, --input-range=LO-HI treat each number LO through HI as an input line
-n, --head-count=COUNT output at most COUNT lines
-o, --output=FILE write result to FILE instead of standard output
--random-source=FILE get random bytes from FILE
-r, --repeat output lines can be repeated
-z, --zero-terminated line delimiter is NUL, not newline
--help display this help and exit
--version output version information and exit
GNU coreutils online help: <http://www.gnu.org/software/coreutils/>
Full documentation at: <http://www.gnu.org/software/coreutils/shuf>
or available locally via: info '(coreutils) shuf invocation'
`
const version = "Go reimplementation of GNU shuf 8.25"
func main() {
eflag := flag.Bool("e", false, "treat each ARG as an input line")
iflag := flag.String("i", "", "treat each number LO through HI as an input line")
nflag := flag.Int("n", -1, "output at most COUNT lines")
oflag := flag.String("o", "", "write result to FILE instead of standard output")
random_source := flag.String("random-source", "", "get random bytes from FILE (for tests etc)")
rflag := flag.Bool("r", false, "output lines can be repeated")
zflag := flag.Bool("z", false, "line delimiter is NUL, not newline")
vflag := flag.Bool("version", false, "output version information and exit")
// long options aliases; tnx rob pike for this trick.
flag.BoolVar(eflag, "echo", false, "alias for -e")
flag.StringVar(iflag, "input-range", *iflag, "alias for -i")
flag.IntVar(nflag, "head-count", *nflag, "alias for -n")
flag.StringVar(oflag, "output", "", "alias for -o")
flag.BoolVar(rflag, "repeat", *rflag, "alias for -r")
flag.BoolVar(zflag, "zero-terminated", *zflag, "alias for -z")
// TODO "--help" prints usage to STDERR and exits NONZERO...
flag.Parse()
if *vflag {
println(version)
return
}
argv := flag.Args()
var rng *rand.Rand
if len(*random_source) > 0 {
rng = rand.New(NewFileRng(*random_source))
} else {
rng = rand.New(NewRandom())
}
delim := byte('\n')
if *zflag {
delim = byte(0)
}
// fmt.Print*() => a write(2) every time!
stdout := bufio.NewWriter(os.Stdout)
if len(*oflag) != 0 {
// os.Stdout is its own filetype for historical reasons;
// it really should be just an io.Writer
f, err := os.OpenFile(*oflag, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
if err != nil {
panic(err)
}
stdout = bufio.NewWriter(f)
}
if *eflag && len(*iflag) != 0 {
panic("can't use -e and -i")
}
var lines []string
if *eflag {
lines = argv
if len(lines) == 0 {
panic("-e ARGS")
}
} else if len(*iflag) != 0 {
if len(argv) != 0 {
panic("-i a-b; no args")
}
a, b := parserange(*iflag)
lines = rangeinclusive(a, b)
} else if len(argv) > 0 {
if len(argv) != 1 {
panic("only one file argument")
}
f, err := os.Open(argv[0])
if err != nil {
panic(err)
}
lines = slurplines(f, delim)
} else {
lines = slurplines(os.Stdin, delim)
}
// finally!
if len(lines) == 0 {
panic("shuf what?")
}
for len(lines) > 0 {
if *nflag >= 0 {
*nflag--
if *nflag < 0 {
break
}
}
i := rng.Intn(len(lines))
lines[0], lines[i] = lines[i], lines[0]
// 3x faster than fmt.Fprintf()!
_, err := stdout.WriteString(lines[0])
if err != nil {
panic(err)
}
err = stdout.WriteByte(delim)
if err != nil {
panic(err)
}
if !*rflag {
lines = lines[1:]
}
}
err := stdout.Flush()
if err != nil {
panic(err)
}
}
func slurplines(r io.Reader, delim byte) []string {
b := bufio.NewReader(r)
var lines []string
for {
bytes, err := b.ReadBytes(delim)
if err != nil {
if err == io.EOF {
break
}
panic(err)
}
// chomp()
if bytes[len(bytes)-1] == delim {
bytes = bytes[:len(bytes)-1]
}
lines = append(lines, string(bytes))
}
return lines
}
// parse "1-23"
func parserange(s string) (uint64, uint64) {
var a, b uint64
var separator byte
n, err := fmt.Sscanf(s, "%v%c%v", &a, &separator, &b)
if err != nil {
// "1" => EOF, the most intuitive diagnostic of the year.
panic(fmt.Sprintf("Sscanf(%v) => %v", s, err))
}
switch separator {
case '-':
break
case ':':
break
default:
panic("bad separator")
}
if n != 3 {
panic("sscanf")
}
return a, b
}
func rangeinclusive(a, b uint64) []string {
var str []string
for {
// string(int) thinks it's a unicode codepoint...
str = append(str, fmt.Sprintf("%v", a))
if a == b {
break
}
a++
}
return str
}
// default source: aes/ctr seeded from crypto/rand
// math/rand results in shuf being ~20% faster than aes256.
// aes 192/256 are equally fast.
// UPDATE: after getting rid of Fprintf(), math/rand is 2x faster!
// (and almost as fast as GNU/shuf; tested with "ls|shuf -r")
// The obvious speedup is to just use a few bytes (we pull 64 bits each time)
type rng struct {
stream cipher.Stream
}
func (r *rng) Seed(seed int64) { _ = seed }
func (r *rng) Int63() int64 {
b := make([]byte, 8)
r.stream.XORKeyStream(b, b)
return int64(binary.LittleEndian.Uint64(b) >> 1)
}
func NewRandom() rand.Source {
key := make([]byte, 256/8)
n, err := urandom.Read(key)
if err != nil {
panic(err) // maybe use a math/rand...
}
if n != len(key) {
panic("rand.Read borked")
}
b, err := aes.NewCipher(key)
if err != nil {
panic(err)
}
r := new(rng)
r.stream = cipher.NewCTR(b, make([]byte, 128/8))
return r
}
// file source: read bytes from file
type filerng struct {
name string
r io.Reader
}
func (f *filerng) Seed(seed int64) { _ = seed }
func (f *filerng) Int63() int64 {
b := make([]byte, 8)
n, err := f.r.Read(b)
if err != nil {
panic(err)
}
if n != len(b) {
panic("short read from " + f.name)
}
return int64(binary.LittleEndian.Uint64(b) >> 1)
}
func NewFileRng(filename string) rand.Source {
f, err := os.Open(filename)
if err != nil {
panic(err)
}
return &filerng{r: bufio.NewReader(f), name: filename}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment