@rasky
Created January 30, 2016 12:43
Showcase of optimization opportunities for the Go SSA backend
package main

/*
HEAD is:
commit 88b230eaa69647405e7c278044550640fc098111
Author: David Chase <drchase@google.com>
Date:   Fri Jan 29 14:44:15 2016 -0500

    [dev.ssa] cmd/compile: exposed do-log boolean to reduce allocations
*/

import (
	"encoding/binary"
	"fmt"
	"os"
	"strconv"
	"unsafe"
)

var array16 [16]int
var array256 [256]int
var barray16 [16]byte
var bslice []byte
func SliceBoundCheck(data uint32) {
	// This generates one bounds check per iteration. In reality, it would be enough to
	// check once, before the loop, whether the slice is shorter than 100.
	// This raises the question of whether the optimization is valid: if the compiler can
	// *prove* that the loop will panic at some point, is it valid to panic right away,
	// without running any iterations?
	for i := 0; i < 100; i++ {
		bslice[i] = byte(i)
	}

	// In this case, there is no excuse: the explicit check below means the optimizer
	// could always drop the bounds check within the loop.
	_ = bslice[99]
	for i := 0; i < 100; i++ {
		bslice[i] = byte(i)
	}

	// Same as above, with a non-constant bound.
	for i := 0; i < int(data); i++ {
		bslice[i] = byte(i)
	}
}
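
// A sketch (not part of the original gist) of how the non-constant-bound case
// could be rewritten by hand: clamping the bound to len(bslice) up front gives
// the prover a single fact from which all per-iteration checks follow. Note
// that this changes semantics from the panicking version; it is only meant to
// show a form the prover can exploit. The function name is hypothetical.
func SliceBoundCheckHinted(data uint32) {
	n := int(data)
	if n > len(bslice) {
		n = len(bslice)
	}
	// After the clamp, 0 <= i < n <= len(bslice) holds on every iteration,
	// so one check up front is all that is semantically required.
	for i := 0; i < n; i++ {
		bslice[i] = byte(i)
	}
}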
func ExtraBoundChecks(data uint32) {
	// This code generates 4 bounds checks, one for each memory access, plus the overhead
	// of creating the temporary throw-away slice.
	val := binary.LittleEndian.Uint32(barray16[data:])
	fmt.Println(val)

	// This is a hand-optimized version with just one bounds check (and no throw-away
	// slice). I would expect the compiler to generate code similar to this.
	if int(data+3) >= len(barray16) {
		panic("out of bounds")
	}
	ptr := uintptr(unsafe.Pointer(&barray16[data]))
	val2 := uint32(*(*uint8)(unsafe.Pointer(ptr + 0))) | uint32(*(*uint8)(unsafe.Pointer(ptr + 1)))<<8 |
		uint32(*(*uint8)(unsafe.Pointer(ptr + 2)))<<16 | uint32(*(*uint8)(unsafe.Pointer(ptr + 3)))<<24
	fmt.Println(val2)
}
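
// A sketch (not part of the original gist) of the same idea without unsafe:
// reslicing once and adding a `_ = b[3]` hint gives the compiler a single
// check from which all four byte loads can be proven in range.
func ExtraBoundChecksHinted(data uint32) {
	b := barray16[data:]
	_ = b[3] // one bounds check, covering b[0] through b[3]
	val := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
	fmt.Println(val)
}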
func DoubleLoad(data int32) {
	if data>>28 == 0x7 {
		fmt.Println("unlikely")
	}
	fmt.Println(data >> 20)
	/*
		0x205f 48c7c11c000000  MOVQ $0x1c, CX
		0x2066 8b542470        MOVL 0x70(SP), DX
		0x206a 89d0            MOVL DX, AX
		0x206c d3f8            SARL CL, AX
		0x206e 83f807          CMPL $0x7, AX
		0x2071 0f848c000000    JE 0x2103
		[....]
		0x2103 48c7c114000000  MOVQ $0x14, CX
		0x210a 8b542470        MOVL 0x70(SP), DX   <---- reload from stack in likely path
		0x210e 89d0            MOVL DX, AX
		0x2110 d3f8            SARL CL, AX

		(bonus points: I don't know if using the CL shift is better than an immediate shift)
	*/
}
func MissedRotation(data int32) {
	rot := uint((data >> 16) & 0x16)
	val := (data&0xFF)<<rot | (data&0xFF)>>(32-rot)
	fmt.Println(val)
	// This should become a rotation. Note that:
	//   * the AND mask on rot still keeps the value in the 0-31 range (the optimizer
	//     should not be confused by the fact that it's not the "usual" mask 0x1F);
	//   * the data is AND-ed before being rotated, but that should not prevent the
	//     optimization either.
}
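
// For reference, a sketch (not part of the original gist) of the canonical
// shape that compilers typically pattern-match into a single rotate
// instruction: an unsigned value with the usual 0x1F mask. In Go, a shift
// count >= 32 on a uint32 is defined to yield 0, so the rot == 0 case is safe.
func CanonicalRotation(data uint32, rot uint) uint32 {
	rot &= 0x1F
	return data<<rot | data>>(32-rot)
}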
func BoundChecks(data int32) {
	fmt.Println(array16[data&0xF])
	/*
		0x2069 488d0d70800a00  LEAQ 0xa8070(IP), CX
		0x2070 48890c24        MOVQ CX, 0(SP)
		0x2074 8b442468        MOVL 0x68(SP), AX
		0x2078 83e00f          ANDL $0xf, AX
		0x207b 4863d0          MOVSXD AX, DX
		0x207e 4889542430      MOVQ DX, 0x30(SP)
		0x2083 488d05f6c71800  LEAQ 0x18c7f6(IP), AX
		0x208a 488d1cd0        LEAQ 0(AX)(DX*8), BX

		No bounds check = good.
		(The spill is inefficient IMO, but that's not the point.)
	*/
	idx := data & 0xF
	fmt.Println(array16[idx])
	/*
		0x2105 488b4c2430      MOVQ 0x30(SP), CX
		0x210a 4883f910        CMPQ $0x10, CX
		0x210e 7374            JAE 0x2184

		Bounds check (bad).
	*/
}
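
// A sketch (not part of the original gist) of a workaround until range facts
// survive a trip through a named, spilled variable: re-applying the mask at
// the use site keeps the 0-15 range visible to the prover at the index
// expression itself. The redundant mask should in turn be eliminated once the
// value is known to be masked already.
func BoundChecksRemasked(data int32) {
	idx := data & 0xF
	fmt.Println(array16[idx&0xF]) // mask repeated at the index expression
}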
func checkBit(data int32, nbit uint) bool {
	return (data>>nbit)&1 != 0
}
func BoolChecks(data int32) {
	what := checkBit(data, 4)
	if what {
		fmt.Println("bit checked")
	}
	/*
		0x2040 65488b0c25a0080000  GS MOVQ GS:0x8a0, CX
		0x2049 483b6110            CMPQ 0x10(CX), SP
		0x204d 0f86c3000000        JBE 0x2116
		0x2053 4883ec50            SUBQ $0x50, SP
		0x2057 48c7c104000000      MOVQ $0x4, CX
		0x205e 8b442458            MOVL 0x58(SP), AX   <--- useless spill to stack
		0x2062 d3f8                SARL CL, AX
		0x2064 83e001              ANDL $0x1, AX
		0x2067 83f800              CMPL $0x0, AX       <--- useless comparison
		0x206a 0f8488000000        JE 0x20f8
	*/
}
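
// A sketch (not part of the original gist) of the form one might write by
// hand: testing the bit with a constant mask avoids materializing the bool
// and comparing it against zero. Ideally, inlining checkBit plus SSA
// simplification would reduce BoolChecks to exactly this.
func BoolChecksDirect(data int32) {
	if data&(1<<4) != 0 {
		fmt.Println("bit checked")
	}
}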
func UselessBitTweaking(data uint16) {
	fmt.Println(array256[(data>>8)&0xFF])
	/*
		0x2949 488d0590800a00  LEAQ 0xa8090(IP), AX
		0x2950 48890424        MOVQ AX, 0(SP)      <---- useless spill
		0x2954 48c7c108000000  MOVQ $0x8, CX
		0x295b 668b442448      MOVW 0x48(SP), AX
		0x2960 66d3e8          SHRW CL, AX
		0x2963 6625ff00        ANDW $0xff, AX      <----- useless mask
		0x2967 480fb7c0        MOVZX AX, AX        <------ useless extension
		0x296b 488d0dcee11800  LEAQ 0x18e1ce(IP), CX
		0x2972 488d04c1        LEAQ 0(CX)(AX*8), AX
	*/
}
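
// A sketch (not part of the original gist) of an equivalent formulation:
// converting to byte makes the 0xFF mask structurally redundant, and since a
// byte index is always in 0-255, the bounds check on array256 can be elided.
// This is the reduction the optimizer should perform on the shift/mask form.
func UselessBitTweakingReduced(data uint16) {
	fmt.Println(array256[byte(data>>8)])
}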
func main() {
	data, _ := strconv.Atoi(os.Args[1])
	BoolChecks(int32(data))
	BoundChecks(int32(data))
	MissedRotation(int32(data))
	DoubleLoad(int32(data))
	ExtraBoundChecks(uint32(data))
	SliceBoundCheck(uint32(data))
	UselessBitTweaking(uint16(data))
}
@dr2chase

I'll give these a look. The latest bounds-check stuff is not visible yet; I need to put that up for review.
There are still a few missing links in the inference chain.

Also, in the case where it can be shown that a loop will eventually panic, we cannot panic early unless there are no side effects.
Otherwise we need to multi-version the loop, which probably won't make 1.7.

@rasky (Author) commented Jan 31, 2016

@dr2chase If we can get the bounds checks removed with the "_ = bslice[99]" trick in 1.7, that would be great; it could serve as a sort of insider trick until loop versioning is implemented. I think, for instance, x/image would love to use it.
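
For reference, a minimal sketch of the trick in a decode-style loop (hypothetical helper, not actual x/image code):

    func fill(dst []byte) {
        _ = dst[99] // single up-front check; per-iteration checks become provably redundant
        for i := 0; i < 100; i++ {
            dst[i] = byte(i)
        }
    }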
