rasky/gossaimprovements.go

## gossaimprovements.go
package main

/*
HEAD is:

commit 88b230eaa69647405e7c278044550640fc098111
Author: David Chase <drchase@google.com>
Date:   Fri Jan 29 14:44:15 2016 -0500

    [dev.ssa] cmd/compile: exposed do-log boolean to reduce allocations
*/

import (
	"encoding/binary"
	"fmt"
	"os"
	"strconv"
	"unsafe"
)

var array16 [16]int
var array256 [256]int
var barray16 [16]byte
var bslice []byte

func SliceBoundCheck(data uint32) {
	// This generates one bound-check per iteration. In reality, it would be enough to check
	// once if the slice is smaller than 100 to avoid any bound checking.
	// Obviously this triggers the question if the optimization is valid; if the compiler can
	// *prove* that the loop will panic at some point, is it a valid optimization to panic right
	// away, without doing any iteration?
	for i := 0; i < 100; i++ {
		bslice[i] = byte(i)
	}

	// In this case, there are no excuses. Here the optimizer could always drop the bound check
	// within the loop
	_ = bslice[99]
	for i := 0; i < 100; i++ {
		bslice[i] = byte(i)
	}

	// Same as above, with non-constant bound
	for i := 0; i < int(data); i++ {
		bslice[i] = byte(i)
	}
}

func ExtraBoundChecks(data uint32) {
	// This code generates 4 bound checks, one for each memory access, plus the overhead for
	// generating the temporary throw-away slice
	val := binary.LittleEndian.Uint32(barray16[data:])
	fmt.Println(val)

	// This is an optimized version with just one bound checking (and no throw-away slice
	// generation). I would expect the compiler to generate code similar to this.
	if int(data+3) >= len(barray16) {
		panic("out of bounds")
	}
	ptr := uintptr(unsafe.Pointer(&barray16[data]))
	val2 := (uint32)(*(*uint8)(unsafe.Pointer(ptr + 0))) | (uint32)(*(*uint8)(unsafe.Pointer(ptr + 1)))<<8 |
		(uint32)(*(*uint8)(unsafe.Pointer(ptr + 2)))<<16 | (uint32)(*(*uint8)(unsafe.Pointer(ptr + 3)))<<24
	fmt.Println(val2)
}

func DoubleLoad(data int32) {

	if data>>28 == 0x7 {
		fmt.Println("unlikely")
	}

	fmt.Println(data >> 20)

	/*

	   0x205f  48c7c11c000000          MOVQ $0x1c, CX
	   0x2066  8b542470                MOVL 0x70(SP), DX
	   0x206a  89d0                    MOVL DX, AX
	   0x206c  d3f8                    SARL CL, AX
	   0x206e  83f807                  CMPL $0x7, AX
	   0x2071  0f848c000000            JE 0x2103
	    [....]
	   0x2103  48c7c114000000          MOVQ $0x14, CX
	   0x210a  8b542470                MOVL 0x70(SP), DX      <---- reload from stack in likely path
	   0x210e  89d0                    MOVL DX, AX
	   0x2110  d3f8                    SARL CL, AX

	   (bonus points: I don't know if using the CL shift is better than an immediate shift)
	*/
}

func MissedRotation(data int32) {

	rot := uint((data >> 16) & 0x16)
	val := (data&0xFF)<<rot | (data&0xFF)>>(32-rot)
	fmt.Println(val)

	// This should become a rotation. Note that:
	//  * the AND mask of rot still keeps the value in the 0-31 range (the optimizer should not be
	//    confused by the fact that it's not the "usual" mask 0x1F)
	//  * the data is being AND-ed before being rotated, but that should not prevent the
	//    optimization as well.
}

func BoundChecks(data int32) {
	fmt.Println(array16[data&0xF])

	/*
	    0x2069  488d0d70800a00          LEAQ 0xa8070(IP), CX
	    0x2070  48890c24                MOVQ CX, 0(SP)
	    0x2074  8b442468                MOVL 0x68(SP), AX
	    0x2078  83e00f                  ANDL $0xf, AX
	    0x207b  4863d0                  MOVSXD AX, DX
	    0x207e  4889542430              MOVQ DX, 0x30(SP)
	    0x2083  488d05f6c71800          LEAQ 0x18c7f6(IP), AX
	    0x208a  488d1cd0                LEAQ 0(AX)(DX*8), BX

	    no bound checking = good
	   (the spill is inefficient IMO, but that's not the point)
	*/

	idx := data & 0xF
	fmt.Println(array16[idx])

	/*
	   0x2105  488b4c2430              MOVQ 0x30(SP), CX
	   0x210a  4883f910                CMPQ $0x10, CX
	   0x210e  7374                    JAE 0x2184

	   bounds checking (bad)
	*/
}

func checkBit(data int32, nbit uint) bool {
	return (data>>nbit)&1 != 0
}

func BoolChecks(data int32) {
	what := checkBit(data, 4)
	if what {
		fmt.Println("bit checked")
	}
	/*
	   0x2040  65488b0c25a0080000      GS MOVQ GS:0x8a0, CX
	   0x2049  483b6110                CMPQ 0x10(CX), SP
	   0x204d  0f86c3000000            JBE 0x2116
	   0x2053  4883ec50                SUBQ $0x50, SP
	   0x2057  48c7c104000000          MOVQ $0x4, CX
	   0x205e  8b442458                MOVL 0x58(SP), AX        <--- useless spill to stack
	   0x2062  d3f8                    SARL CL, AX
	   0x2064  83e001                  ANDL $0x1, AX
	   0x2067  83f800                  CMPL $0x0, AX            <--- useless comparison
	   0x206a  0f8488000000            JE 0x20f8
	*/
}

func UselessBitTweaking(data uint16) {
	fmt.Println(array256[(data>>8)&0xFF])

	/*
	   0x2949  488d0590800a00          LEAQ 0xa8090(IP), AX
	   0x2950  48890424                MOVQ AX, 0(SP)       <---- useless spill
	   0x2954  48c7c108000000          MOVQ $0x8, CX
	   0x295b  668b442448              MOVW 0x48(SP), AX
	   0x2960  66d3e8                  SHRW CL, AX
	   0x2963  6625ff00                ANDW $0xff, AX       <----- useless mask
	   0x2967  480fb7c0                MOVZX AX, AX           <------ useless extension
	   0x296b  488d0dcee11800          LEAQ 0x18e1ce(IP), CX
	   0x2972  488d04c1                LEAQ 0(CX)(AX*8), AX
	*/
}

func main() {
	data, _ := strconv.Atoi(os.Args[1])
	BoolChecks(int32(data))
	BoundChecks(int32(data))
	MissedRotation(int32(data))
	DoubleLoad(int32(data))
	ExtraBoundChecks(uint32(data))
	SliceBoundCheck(uint32(data))
	UselessBitTweaking(uint16(data))
}
	package main

	/*
	HEAD is:

	commit 88b230eaa69647405e7c278044550640fc098111
	Author: David Chase <drchase@google.com>
	Date: Fri Jan 29 14:44:15 2016 -0500

	[dev.ssa] cmd/compile: exposed do-log boolean to reduce allocations
	*/

	import (
	"encoding/binary"
	"fmt"
	"os"
	"strconv"
	"unsafe"
	)

	var array16 [16]int
	var array256 [256]int
	var barray16 [16]byte
	var bslice []byte

	func SliceBoundCheck(data uint32) {
	// This generates one bound-check per iteration. In reality, it would be enough to check
	// once if the slice is smaller than 100 to avoid any bound checking.
	// Obviously this triggers the question if the optimization is valid; if the compiler can
	// prove that the loop will panic at some point, is it a valid optimization to panic right
	// away, without doing any iteration?
	for i := 0; i < 100; i++ {
	bslice[i] = byte(i)
	}

	// In this case, there are no excuses. Here the optimizer could always drop the bound check
	// within the loop
	_ = bslice[99]
	for i := 0; i < 100; i++ {
	bslice[i] = byte(i)
	}

	// Same as above, with non-constant bound
	for i := 0; i < int(data); i++ {
	bslice[i] = byte(i)
	}
	}

	func ExtraBoundChecks(data uint32) {
	// This code generates 4 bound checks, one for each memory access, plus the overhead for
	// generating the temporary throw-away slice
	val := binary.LittleEndian.Uint32(barray16[data:])
	fmt.Println(val)

	// This is an optimized version with just one bound checking (and no throw-away slice
	// generation). I would expect the compiler to generate code similar to this.
	if int(data+3) >= len(barray16) {
	panic("out of bounds")
	}
	ptr := uintptr(unsafe.Pointer(&barray16[data]))
	val2 := (uint32)((uint8)(unsafe.Pointer(ptr + 0))) \| (uint32)((uint8)(unsafe.Pointer(ptr + 1)))<<8 \|
	(uint32)((uint8)(unsafe.Pointer(ptr + 2)))<<16 \| (uint32)((uint8)(unsafe.Pointer(ptr + 3)))<<24
	fmt.Println(val2)
	}

	func DoubleLoad(data int32) {

	if data>>28 == 0x7 {
	fmt.Println("unlikely")
	}

	fmt.Println(data >> 20)

	/*

	0x205f 48c7c11c000000 MOVQ $0x1c, CX
	0x2066 8b542470 MOVL 0x70(SP), DX
	0x206a 89d0 MOVL DX, AX
	0x206c d3f8 SARL CL, AX
	0x206e 83f807 CMPL $0x7, AX
	0x2071 0f848c000000 JE 0x2103
	[....]
	0x2103 48c7c114000000 MOVQ $0x14, CX
	0x210a 8b542470 MOVL 0x70(SP), DX <---- reload from stack in likely path
	0x210e 89d0 MOVL DX, AX
	0x2110 d3f8 SARL CL, AX

	(bonus points: I don't know if using the CL shift is better than an immediate shift)
	*/
	}

	func MissedRotation(data int32) {

	rot := uint((data >> 16) & 0x16)
	val := (data&0xFF)<<rot \| (data&0xFF)>>(32-rot)
	fmt.Println(val)

	// This should become a rotation. Note that:
	// * the AND mask of rot still keeps the value in the 0-31 range (the optimizer should not be
	// confused by the fact that it's not the "usual" mask 0x1F)
	// * the data is being AND-ed before being rotated, but that should not prevent the
	// optimization as well.
	}

	func BoundChecks(data int32) {
	fmt.Println(array16[data&0xF])

	/*
	0x2069 488d0d70800a00 LEAQ 0xa8070(IP), CX
	0x2070 48890c24 MOVQ CX, 0(SP)
	0x2074 8b442468 MOVL 0x68(SP), AX
	0x2078 83e00f ANDL $0xf, AX
	0x207b 4863d0 MOVSXD AX, DX
	0x207e 4889542430 MOVQ DX, 0x30(SP)
	0x2083 488d05f6c71800 LEAQ 0x18c7f6(IP), AX
	0x208a 488d1cd0 LEAQ 0(AX)(DX*8), BX

	no bound checking = good
	(the spill is inefficient IMO, but that's not the point)
	*/

	idx := data & 0xF
	fmt.Println(array16[idx])

	/*
	0x2105 488b4c2430 MOVQ 0x30(SP), CX
	0x210a 4883f910 CMPQ $0x10, CX
	0x210e 7374 JAE 0x2184

	bounds checking (bad)
	*/
	}

	func checkBit(data int32, nbit uint) bool {
	return (data>>nbit)&1 != 0
	}

	func BoolChecks(data int32) {
	what := checkBit(data, 4)
	if what {
	fmt.Println("bit checked")
	}
	/*
	0x2040 65488b0c25a0080000 GS MOVQ GS:0x8a0, CX
	0x2049 483b6110 CMPQ 0x10(CX), SP
	0x204d 0f86c3000000 JBE 0x2116
	0x2053 4883ec50 SUBQ $0x50, SP
	0x2057 48c7c104000000 MOVQ $0x4, CX
	0x205e 8b442458 MOVL 0x58(SP), AX <--- useless spill to stack
	0x2062 d3f8 SARL CL, AX
	0x2064 83e001 ANDL $0x1, AX
	0x2067 83f800 CMPL $0x0, AX <--- useless comparison
	0x206a 0f8488000000 JE 0x20f8
	*/
	}

	func UselessBitTweaking(data uint16) {
	fmt.Println(array256[(data>>8)&0xFF])

	/*
	0x2949 488d0590800a00 LEAQ 0xa8090(IP), AX
	0x2950 48890424 MOVQ AX, 0(SP) <---- useless spill
	0x2954 48c7c108000000 MOVQ $0x8, CX
	0x295b 668b442448 MOVW 0x48(SP), AX
	0x2960 66d3e8 SHRW CL, AX
	0x2963 6625ff00 ANDW $0xff, AX <----- useless mask
	0x2967 480fb7c0 MOVZX AX, AX <------ useless extension
	0x296b 488d0dcee11800 LEAQ 0x18e1ce(IP), CX
	0x2972 488d04c1 LEAQ 0(CX)(AX*8), AX
	*/
	}

	func main() {
	data, _ := strconv.Atoi(os.Args[1])
	BoolChecks(int32(data))
	BoundChecks(int32(data))
	MissedRotation(int32(data))
	DoubleLoad(int32(data))
	ExtraBoundChecks(uint32(data))
	SliceBoundCheck(uint32(data))
	UselessBitTweaking(uint16(data))
	}