loreb/shuf.go

## shuf.go
// A reimplementation of GNU shuf (think *BSD)
// -- there was a shuffle(1) ages ago, but it got deleted.
// GNU shuf goes out of its way to optimize anything it can
// -- just see its treatment of --random-source!
// this is *much* more simple minded - TLDR: use GNU shuf.
// Seriously, its inspiration comes from:
// 1. shuf on *BSD without installing coreutils;
// 2. my router abandoning me for >30 minutes.
package main

import (
	"bufio"
	"crypto/aes"
	"crypto/cipher"
	urandom "crypto/rand"
	"encoding/binary"
	"flag"
	"fmt"
	"io"
	"math/rand"
	"os"
)

// original usage (for reference)
const usage = `
Usage: shuf [OPTION]... [FILE]
  or:  shuf -e [OPTION]... [ARG]...
  or:  shuf -i LO-HI [OPTION]...
Write a random permutation of the input lines to standard output.

With no FILE, or when FILE is -, read standard input.

Mandatory arguments to long options are mandatory for short options too.
  -e, --echo                treat each ARG as an input line
  -i, --input-range=LO-HI   treat each number LO through HI as an input line
  -n, --head-count=COUNT    output at most COUNT lines
  -o, --output=FILE         write result to FILE instead of standard output
      --random-source=FILE  get random bytes from FILE
  -r, --repeat              output lines can be repeated
  -z, --zero-terminated     line delimiter is NUL, not newline
      --help     display this help and exit
      --version  output version information and exit

GNU coreutils online help: <http://www.gnu.org/software/coreutils/>
Full documentation at: <http://www.gnu.org/software/coreutils/shuf>
or available locally via: info '(coreutils) shuf invocation'
`
const version = "Go reimplementation of GNU shuf 8.25"

func main() {
	eflag := flag.Bool("e", false, "treat each ARG as an input line")
	iflag := flag.String("i", "", "treat each number LO through HI as an input line")
	nflag := flag.Int("n", -1, "output at most COUNT lines")
	oflag := flag.String("o", "", "write result to FILE instead of standard output")
	random_source := flag.String("random-source", "", "get random bytes from FILE (for tests etc)")
	rflag := flag.Bool("r", false, "output lines can be repeated")
	zflag := flag.Bool("z", false, "line delimiter is NUL, not newline")
	vflag := flag.Bool("version", false, "output version information and exit")
	// long options aliases; tnx rob pike for this trick.
	flag.BoolVar(eflag, "echo", false, "alias for -e")
	flag.StringVar(iflag, "input-range", *iflag, "alias for -i")
	flag.IntVar(nflag, "head-count", *nflag, "alias for -n")
	flag.StringVar(oflag, "output", "", "alias for -o")
	flag.BoolVar(rflag, "repeat", *rflag, "alias for -r")
	flag.BoolVar(zflag, "zero-terminated", *zflag, "alias for -z")
	// TODO "--help" prints usage to STDERR and exits NONZERO...
	flag.Parse()
	if *vflag {
		println(version)
		return
	}
	argv := flag.Args()
	var rng *rand.Rand
	if len(*random_source) > 0 {
		rng = rand.New(NewFileRng(*random_source))
	} else {
		rng = rand.New(NewRandom())
	}
	delim := byte('\n')
	if *zflag {
		delim = byte(0)
	}
	// fmt.Print*() => a write(2) every time!
	stdout := bufio.NewWriter(os.Stdout)
	if len(*oflag) != 0 {
		// os.Stdout is its own filetype for historical reasons;
		// it really should be just an io.Writer
		f, err := os.OpenFile(*oflag, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
		if err != nil {
			panic(err)
		}
		stdout = bufio.NewWriter(f)
	}
	if *eflag && len(*iflag) != 0 {
		panic("can't use -e and -i")
	}
	var lines []string
	if *eflag {
		lines = argv
		if len(lines) == 0 {
			panic("-e ARGS")
		}
	} else if len(*iflag) != 0 {
		if len(argv) != 0 {
			panic("-i a-b; no args")
		}
		a, b := parserange(*iflag)
		lines = rangeinclusive(a, b)
	} else if len(argv) > 0 {
		if len(argv) != 1 {
			panic("only one file argument")
		}
		f, err := os.Open(argv[0])
		if err != nil {
			panic(err)
		}
		lines = slurplines(f, delim)
	} else {
		lines = slurplines(os.Stdin, delim)
	}
	// finally!
	if len(lines) == 0 {
		panic("shuf what?")
	}
	for len(lines) > 0 {
		if *nflag >= 0 {
			*nflag--
			if *nflag < 0 {
				break
			}
		}
		i := rng.Intn(len(lines))
		lines[0], lines[i] = lines[i], lines[0]
		// 3x faster than fmt.Fprintf()!
		_, err := stdout.WriteString(lines[0])
		if err != nil {
			panic(err)
		}
		err = stdout.WriteByte(delim)
		if err != nil {
			panic(err)
		}
		if !*rflag {
			lines = lines[1:]
		}
	}
	err := stdout.Flush()
	if err != nil {
		panic(err)
	}
}

func slurplines(r io.Reader, delim byte) []string {
	b := bufio.NewReader(r)
	var lines []string
	for {
		bytes, err := b.ReadBytes(delim)
		if err != nil {
			if err == io.EOF {
				break
			}
			panic(err)
		}
		// chomp()
		if bytes[len(bytes)-1] == delim {
			bytes = bytes[:len(bytes)-1]
		}
		lines = append(lines, string(bytes))
	}
	return lines
}

// parse "1-23"
func parserange(s string) (uint64, uint64) {
	var a, b uint64
	var separator byte
	n, err := fmt.Sscanf(s, "%v%c%v", &a, &separator, &b)
	if err != nil {
		// "1" => EOF, the most intuitive diagnostic of the year.
		panic(fmt.Sprintf("Sscanf(%v) => %v", s, err))
	}
	switch separator {
	case '-':
		break
	case ':':
		break
	default:
		panic("bad separator")
	}
	if n != 3 {
		panic("sscanf")
	}
	return a, b
}

func rangeinclusive(a, b uint64) []string {
	var str []string
	for {
		// string(int) thinks it's a unicode codepoint...
		str = append(str, fmt.Sprintf("%v", a))
		if a == b {
			break
		}
		a++
	}
	return str
}

// default source: aes/ctr seeded from crypto/rand
// math/rand results in shuf being ~20% faster than aes256.
// aes 192/256 are equally fast.
// UPDATE: after getting rid of Fprintf(), math/rand is 2x faster!
// (and almost as fast as GNU/shuf; tested with "ls|shuf -r")
// The obvious speedup is to just use a few bytes (we pull 64 bits each time)
type rng struct {
	stream cipher.Stream
}

func (r *rng) Seed(seed int64) { _ = seed }
func (r *rng) Int63() int64 {
	b := make([]byte, 8)
	r.stream.XORKeyStream(b, b)
	return int64(binary.LittleEndian.Uint64(b) >> 1)
}
func NewRandom() rand.Source {
	key := make([]byte, 256/8)
	n, err := urandom.Read(key)
	if err != nil {
		panic(err) // maybe use a math/rand...
	}
	if n != len(key) {
		panic("rand.Read borked")
	}
	b, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	r := new(rng)
	r.stream = cipher.NewCTR(b, make([]byte, 128/8))
	return r
}

// file source: read bytes from file
type filerng struct {
	name string
	r    io.Reader
}

func (f *filerng) Seed(seed int64) { _ = seed }
func (f *filerng) Int63() int64 {
	b := make([]byte, 8)
	n, err := f.r.Read(b)
	if err != nil {
		panic(err)
	}
	if n != len(b) {
		panic("short read from " + f.name)
	}
	return int64(binary.LittleEndian.Uint64(b) >> 1)
}
func NewFileRng(filename string) rand.Source {
	f, err := os.Open(filename)
	if err != nil {
		panic(err)
	}
	return &filerng{r: bufio.NewReader(f), name: filename}
}
	// A reimplementation of GNU shuf (think *BSD)
	// -- there was a shuffle(1) ages ago, but it got deleted.
	// GNU shuf goes out of its way to optimize anything it can
	// -- just see its treatment of --random-source!
	// this is much more simple minded - TLDR: use GNU shuf.
	// Seriously, its inspiration comes from:
	// 1. shuf on *BSD without installing coreutils;
	// 2. my router abandoning me for >30 minutes.
	package main

	import (
	"bufio"
	"crypto/aes"
	"crypto/cipher"
	urandom "crypto/rand"
	"encoding/binary"
	"flag"
	"fmt"
	"io"
	"math/rand"
	"os"
	)

	// original usage (for reference)
	const usage = `
	Usage: shuf [OPTION]... [FILE]
	or: shuf -e [OPTION]... [ARG]...
	or: shuf -i LO-HI [OPTION]...
	Write a random permutation of the input lines to standard output.

	With no FILE, or when FILE is -, read standard input.

	Mandatory arguments to long options are mandatory for short options too.
	-e, --echo treat each ARG as an input line
	-i, --input-range=LO-HI treat each number LO through HI as an input line
	-n, --head-count=COUNT output at most COUNT lines
	-o, --output=FILE write result to FILE instead of standard output
	--random-source=FILE get random bytes from FILE
	-r, --repeat output lines can be repeated
	-z, --zero-terminated line delimiter is NUL, not newline
	--help display this help and exit
	--version output version information and exit

	GNU coreutils online help: <http://www.gnu.org/software/coreutils/>
	Full documentation at: <http://www.gnu.org/software/coreutils/shuf>
	or available locally via: info '(coreutils) shuf invocation'
	`
	const version = "Go reimplementation of GNU shuf 8.25"

	func main() {
	eflag := flag.Bool("e", false, "treat each ARG as an input line")
	iflag := flag.String("i", "", "treat each number LO through HI as an input line")
	nflag := flag.Int("n", -1, "output at most COUNT lines")
	oflag := flag.String("o", "", "write result to FILE instead of standard output")
	random_source := flag.String("random-source", "", "get random bytes from FILE (for tests etc)")
	rflag := flag.Bool("r", false, "output lines can be repeated")
	zflag := flag.Bool("z", false, "line delimiter is NUL, not newline")
	vflag := flag.Bool("version", false, "output version information and exit")
	// long options aliases; tnx rob pike for this trick.
	flag.BoolVar(eflag, "echo", false, "alias for -e")
	flag.StringVar(iflag, "input-range", *iflag, "alias for -i")
	flag.IntVar(nflag, "head-count", *nflag, "alias for -n")
	flag.StringVar(oflag, "output", "", "alias for -o")
	flag.BoolVar(rflag, "repeat", *rflag, "alias for -r")
	flag.BoolVar(zflag, "zero-terminated", *zflag, "alias for -z")
	// TODO "--help" prints usage to STDERR and exits NONZERO...
	flag.Parse()
	if *vflag {
	println(version)
	return
	}
	argv := flag.Args()
	var rng *rand.Rand
	if len(*random_source) > 0 {
	rng = rand.New(NewFileRng(*random_source))
	} else {
	rng = rand.New(NewRandom())
	}
	delim := byte('\n')
	if *zflag {
	delim = byte(0)
	}
	// fmt.Print*() => a write(2) every time!
	stdout := bufio.NewWriter(os.Stdout)
	if len(*oflag) != 0 {
	// os.Stdout is its own filetype for historical reasons;
	// it really should be just an io.Writer
	f, err := os.OpenFile(*oflag, os.O_WRONLY\|os.O_TRUNC\|os.O_CREATE, 0666)
	if err != nil {
	panic(err)
	}
	stdout = bufio.NewWriter(f)
	}
	if eflag && len(iflag) != 0 {
	panic("can't use -e and -i")
	}
	var lines []string
	if *eflag {
	lines = argv
	if len(lines) == 0 {
	panic("-e ARGS")
	}
	} else if len(*iflag) != 0 {
	if len(argv) != 0 {
	panic("-i a-b; no args")
	}
	a, b := parserange(*iflag)
	lines = rangeinclusive(a, b)
	} else if len(argv) > 0 {
	if len(argv) != 1 {
	panic("only one file argument")
	}
	f, err := os.Open(argv[0])
	if err != nil {
	panic(err)
	}
	lines = slurplines(f, delim)
	} else {
	lines = slurplines(os.Stdin, delim)
	}
	// finally!
	if len(lines) == 0 {
	panic("shuf what?")
	}
	for len(lines) > 0 {
	if *nflag >= 0 {
	*nflag--
	if *nflag < 0 {
	break
	}
	}
	i := rng.Intn(len(lines))
	lines[0], lines[i] = lines[i], lines[0]
	// 3x faster than fmt.Fprintf()!
	_, err := stdout.WriteString(lines[0])
	if err != nil {
	panic(err)
	}
	err = stdout.WriteByte(delim)
	if err != nil {
	panic(err)
	}
	if !*rflag {
	lines = lines[1:]
	}
	}
	err := stdout.Flush()
	if err != nil {
	panic(err)
	}
	}

	func slurplines(r io.Reader, delim byte) []string {
	b := bufio.NewReader(r)
	var lines []string
	for {
	bytes, err := b.ReadBytes(delim)
	if err != nil {
	if err == io.EOF {
	break
	}
	panic(err)
	}
	// chomp()
	if bytes[len(bytes)-1] == delim {
	bytes = bytes[:len(bytes)-1]
	}
	lines = append(lines, string(bytes))
	}
	return lines
	}

	// parse "1-23"
	func parserange(s string) (uint64, uint64) {
	var a, b uint64
	var separator byte
	n, err := fmt.Sscanf(s, "%v%c%v", &a, &separator, &b)
	if err != nil {
	// "1" => EOF, the most intuitive diagnostic of the year.
	panic(fmt.Sprintf("Sscanf(%v) => %v", s, err))
	}
	switch separator {
	case '-':
	break
	case ':':
	break
	default:
	panic("bad separator")
	}
	if n != 3 {
	panic("sscanf")
	}
	return a, b
	}

	func rangeinclusive(a, b uint64) []string {
	var str []string
	for {
	// string(int) thinks it's a unicode codepoint...
	str = append(str, fmt.Sprintf("%v", a))
	if a == b {
	break
	}
	a++
	}
	return str
	}

	// default source: aes/ctr seeded from crypto/rand
	// math/rand results in shuf being ~20% faster than aes256.
	// aes 192/256 are equally fast.
	// UPDATE: after getting rid of Fprintf(), math/rand is 2x faster!
	// (and almost as fast as GNU/shuf; tested with "ls\|shuf -r")
	// The obvious speedup is to just use a few bytes (we pull 64 bits each time)
	type rng struct {
	stream cipher.Stream
	}

	func (r *rng) Seed(seed int64) { _ = seed }
	func (r *rng) Int63() int64 {
	b := make([]byte, 8)
	r.stream.XORKeyStream(b, b)
	return int64(binary.LittleEndian.Uint64(b) >> 1)
	}
	func NewRandom() rand.Source {
	key := make([]byte, 256/8)
	n, err := urandom.Read(key)
	if err != nil {
	panic(err) // maybe use a math/rand...
	}
	if n != len(key) {
	panic("rand.Read borked")
	}
	b, err := aes.NewCipher(key)
	if err != nil {
	panic(err)
	}
	r := new(rng)
	r.stream = cipher.NewCTR(b, make([]byte, 128/8))
	return r
	}

	// file source: read bytes from file
	type filerng struct {
	name string
	r io.Reader
	}

	func (f *filerng) Seed(seed int64) { _ = seed }
	func (f *filerng) Int63() int64 {
	b := make([]byte, 8)
	n, err := f.r.Read(b)
	if err != nil {
	panic(err)
	}
	if n != len(b) {
	panic("short read from " + f.name)
	}
	return int64(binary.LittleEndian.Uint64(b) >> 1)
	}
	func NewFileRng(filename string) rand.Source {
	f, err := os.Open(filename)
	if err != nil {
	panic(err)
	}
	return &filerng{r: bufio.NewReader(f), name: filename}
	}