Created
January 30, 2016 12:43
-
-
Save rasky/62fba94e3a20d1b05b2a to your computer and use it in GitHub Desktop.
Showcase of optimization opportunities for the Go SSA backend
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
/* | |
HEAD is: | |
commit 88b230eaa69647405e7c278044550640fc098111 | |
Author: David Chase <drchase@google.com> | |
Date: Fri Jan 29 14:44:15 2016 -0500 | |
[dev.ssa] cmd/compile: exposed do-log boolean to reduce allocations | |
*/ | |
import ( | |
"encoding/binary" | |
"fmt" | |
"os" | |
"strconv" | |
"unsafe" | |
) | |
var array16 [16]int | |
var array256 [256]int | |
var barray16 [16]byte | |
var bslice []byte | |
func SliceBoundCheck(data uint32) { | |
// This generates one bound-check per iteration. In reality, it would be enough to check | |
// once if the slice is smaller than 100 to avoid any bound checking. | |
// Obviously this triggers the question if the optimization is valid; if the compiler can | |
// *prove* that the loop will panic at some point, is it a valid optimization to panic right | |
// away, without doing any iteration? | |
for i := 0; i < 100; i++ { | |
bslice[i] = byte(i) | |
} | |
// In this case, there are no excuses. Here the optimizer could always drop the bound check | |
// within the loop | |
_ = bslice[99] | |
for i := 0; i < 100; i++ { | |
bslice[i] = byte(i) | |
} | |
// Same as above, with non-constant bound | |
for i := 0; i < int(data); i++ { | |
bslice[i] = byte(i) | |
} | |
} | |
func ExtraBoundChecks(data uint32) { | |
// This code generates 4 bound checks, one for each memory access, plus the overhead for | |
// generating the temporary throw-away slice | |
val := binary.LittleEndian.Uint32(barray16[data:]) | |
fmt.Println(val) | |
// This is an optimized version with just one bound checking (and no throw-away slice | |
// generation). I would expect the compiler to generate code similar to this. | |
if int(data+3) >= len(barray16) { | |
panic("out of bounds") | |
} | |
ptr := uintptr(unsafe.Pointer(&barray16[data])) | |
val2 := (uint32)(*(*uint8)(unsafe.Pointer(ptr + 0))) | (uint32)(*(*uint8)(unsafe.Pointer(ptr + 1)))<<8 | | |
(uint32)(*(*uint8)(unsafe.Pointer(ptr + 2)))<<16 | (uint32)(*(*uint8)(unsafe.Pointer(ptr + 3)))<<24 | |
fmt.Println(val2) | |
} | |
func DoubleLoad(data int32) { | |
if data>>28 == 0x7 { | |
fmt.Println("unlikely") | |
} | |
fmt.Println(data >> 20) | |
/* | |
0x205f 48c7c11c000000 MOVQ $0x1c, CX | |
0x2066 8b542470 MOVL 0x70(SP), DX | |
0x206a 89d0 MOVL DX, AX | |
0x206c d3f8 SARL CL, AX | |
0x206e 83f807 CMPL $0x7, AX | |
0x2071 0f848c000000 JE 0x2103 | |
[....] | |
0x2103 48c7c114000000 MOVQ $0x14, CX | |
0x210a 8b542470 MOVL 0x70(SP), DX <---- reload from stack in likely path | |
0x210e 89d0 MOVL DX, AX | |
0x2110 d3f8 SARL CL, AX | |
(bonus points: I don't know if using the CL shift is better than an immediate shift) | |
*/ | |
} | |
func MissedRotation(data int32) { | |
rot := uint((data >> 16) & 0x16) | |
val := (data&0xFF)<<rot | (data&0xFF)>>(32-rot) | |
fmt.Println(val) | |
// This should become a rotation. Note that: | |
// * the AND mask of rot still keeps the value in the 0-31 range (the optimizer should not be | |
// confused by the fact that it's not the "usual" mask 0x1F) | |
// * the data is being AND-ed before being rotated, but that should not prevent the | |
// optimization as well. | |
} | |
func BoundChecks(data int32) { | |
fmt.Println(array16[data&0xF]) | |
/* | |
0x2069 488d0d70800a00 LEAQ 0xa8070(IP), CX | |
0x2070 48890c24 MOVQ CX, 0(SP) | |
0x2074 8b442468 MOVL 0x68(SP), AX | |
0x2078 83e00f ANDL $0xf, AX | |
0x207b 4863d0 MOVSXD AX, DX | |
0x207e 4889542430 MOVQ DX, 0x30(SP) | |
0x2083 488d05f6c71800 LEAQ 0x18c7f6(IP), AX | |
0x208a 488d1cd0 LEAQ 0(AX)(DX*8), BX | |
no bound checking = good | |
(the spill is inefficient IMO, but that's not the point) | |
*/ | |
idx := data & 0xF | |
fmt.Println(array16[idx]) | |
/* | |
0x2105 488b4c2430 MOVQ 0x30(SP), CX | |
0x210a 4883f910 CMPQ $0x10, CX | |
0x210e 7374 JAE 0x2184 | |
bounds checking (bad) | |
*/ | |
} | |
func checkBit(data int32, nbit uint) bool { | |
return (data>>nbit)&1 != 0 | |
} | |
func BoolChecks(data int32) { | |
what := checkBit(data, 4) | |
if what { | |
fmt.Println("bit checked") | |
} | |
/* | |
0x2040 65488b0c25a0080000 GS MOVQ GS:0x8a0, CX | |
0x2049 483b6110 CMPQ 0x10(CX), SP | |
0x204d 0f86c3000000 JBE 0x2116 | |
0x2053 4883ec50 SUBQ $0x50, SP | |
0x2057 48c7c104000000 MOVQ $0x4, CX | |
0x205e 8b442458 MOVL 0x58(SP), AX <--- useless spill to stack | |
0x2062 d3f8 SARL CL, AX | |
0x2064 83e001 ANDL $0x1, AX | |
0x2067 83f800 CMPL $0x0, AX <--- useless comparison | |
0x206a 0f8488000000 JE 0x20f8 | |
*/ | |
} | |
func UselessBitTweaking(data uint16) { | |
fmt.Println(array256[(data>>8)&0xFF]) | |
/* | |
0x2949 488d0590800a00 LEAQ 0xa8090(IP), AX | |
0x2950 48890424 MOVQ AX, 0(SP) <---- useless spill | |
0x2954 48c7c108000000 MOVQ $0x8, CX | |
0x295b 668b442448 MOVW 0x48(SP), AX | |
0x2960 66d3e8 SHRW CL, AX | |
0x2963 6625ff00 ANDW $0xff, AX <----- useless mask | |
0x2967 480fb7c0 MOVZX AX, AX <------ useless extension | |
0x296b 488d0dcee11800 LEAQ 0x18e1ce(IP), CX | |
0x2972 488d04c1 LEAQ 0(CX)(AX*8), AX | |
*/ | |
} | |
func main() { | |
data, _ := strconv.Atoi(os.Args[1]) | |
BoolChecks(int32(data)) | |
BoundChecks(int32(data)) | |
MissedRotation(int32(data)) | |
DoubleLoad(int32(data)) | |
ExtraBoundChecks(uint32(data)) | |
SliceBoundCheck(uint32(data)) | |
UselessBitTweaking(uint16(data)) | |
} |
@dr2chase If we can get the bound checks removed with the "_ = bslice[99]" trick on 1.7, it'd be great and it can be sort of an inside trick until loop versioning is implemented. I think for instance x/image would love to use this trick.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'll give these a look. The latest bounds check stuff is not visible yet, I need to put that up for review.
There are still a few missing links in the inference chain.
Also, in the case where it can be shown that a loop will eventually panic, we cannot panic early unless there are no side-effects.
Otherwise we need to multi-version the loop, which is probably not in 1.7.