Skip to content

Instantly share code, notes, and snippets.

@klauspost
Created March 24, 2022 12:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klauspost/8f8dbbd9745662464dfac37d00cbd5f6 to your computer and use it in GitHub Desktop.
Save klauspost/8f8dbbd9745662464dfac37d00cbd5f6 to your computer and use it in GitHub Desktop.
package main
//go:generate go run gen.go -out decompress_amd64_avo.s -stubs delme.go -pkg=huff0
import (
"flag"
"io/ioutil"
"os"
"path/filepath"
"strconv"
_ "github.com/klauspost/compress"
. "github.com/mmcloughlin/avo/build"
"github.com/mmcloughlin/avo/buildtags"
"github.com/mmcloughlin/avo/gotypes"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)
// main drives the generator: it registers the build constraints, emits the
// two variants of the 4-stream main loop (plain x86 and BMI2), lets avo write
// the assembly to the path given by -out, and finally moves that file one
// directory up as a read-only file.
//
// The "out" and "stubs" flags are looked up rather than defined here;
// presumably they are registered by the dot-imported avo build package.
func main() {
	flag.Parse()

	out := flag.Lookup("out")
	outPath := out.Value.String()
	finalPath := filepath.Join("..", outPath)

	// Best effort: the previous output is written read-only (see below), so
	// it has to be removed before it can be regenerated. A missing file is
	// not an error here.
	os.Remove(finalPath)

	// The stub file is only a by-product of generation; make sure it is gone
	// both before and after the run.
	stub := flag.Lookup("stubs")
	if stubPath := stub.Value.String(); stubPath != "" {
		os.Remove(stubPath)
		defer os.Remove(stubPath)
	}

	// Build constraints for the generated file. Each constraint is listed
	// exactly once.
	Constraint(buildtags.Not("appengine").ToConstraint())
	Constraint(buildtags.Not("noasm").ToConstraint())
	Constraint(buildtags.Term("gc").ToConstraint())

	decompress := decompress4x{}
	decompress.generateProcedure("decompress4x_main_loop_x86")
	decompress.bmi2 = true
	decompress.generateProcedure("decompress4x_main_loop_bmi2")

	Generate()

	// Copy the generated assembly next to the package sources and mark it
	// read-only so it is not edited by hand.
	b, err := ioutil.ReadFile(outPath)
	if err != nil {
		panic(err)
	}

	const readOnly = 0444
	err = ioutil.WriteFile(finalPath, b, readOnly)
	if err != nil {
		panic(err)
	}

	// Best effort: drop the temporary copy in the generator directory.
	os.Remove(outPath)
}
// decompress4x holds the options used while generating the 4-stream
// decompression main loop.
type decompress4x struct {
// bmi2 selects the BMI2 shift instructions (SHLXQ/SHRXQ) instead of the
// CL-based SHLQ/SHRQ sequences when emitting code.
bmi2 bool
}
// buffoff is the byte distance between the output regions of two consecutive
// streams inside the shared buffer; stream id writes at displacement
// id*buffoff.
const buffoff = 256 // see decompress.go, we're using [4][256]byte table
// generateProcedure emits one assembly function called name that runs the
// main loop of the 4-stream decoder. Whether BMI2 instructions are used is
// controlled by d.bmi2.
//
// Each loop iteration decodes two symbols from each of the four bit readers
// and advances the output offset by two. The offset is kept in the function's
// uint8 return slot, so whatever it holds when RET executes is the return
// value. The loop ends when either
//   - any reader reports off < 4 after a refill (the "exhausted" flag), or
//   - the uint8 offset wraps around to 0, i.e. 256 bytes per stream have been
//     produced (presumably the caller treats a returned 0 as "buffer full";
//     confirm against the Go caller).
func (d decompress4x) generateProcedure(name string) {
	Package("github.com/klauspost/compress/huff0")
	TEXT(name, 0, "func(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8")
	Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.", "")
	Pragma("noescape")

	out := reg.RAX // Fixed since we need 8H

	// The running offset lives directly in the return slot.
	offsetComp, err := ReturnIndex(0).Resolve()
	if err != nil {
		panic(err)
	}
	offP := offsetComp.Addr
	{
		off := GP8()
		XORB(off, off) // off = 0
		MOVB(off, offP)
	}

	exhausted := reg.RBX                     // Fixed since we need 8H
	XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

	peekBits := GP64()
	buffer := GP64()
	table := GP64()

	Comment("Preload values")
	{
		Load(Param("peekBits"), peekBits)
		Load(Param("buf"), buffer)
		Load(Param("tbl"), table)
	}

	Comment("Main loop")
	Label("main_loop")

	br0 := Dereference(Param("pbr0"))
	d.decodeTwoValues(0, br0, peekBits, table, buffer, out, exhausted, offP)
	br1 := Dereference(Param("pbr1"))
	d.decodeTwoValues(1, br1, peekBits, table, buffer, out, exhausted, offP)
	br2 := Dereference(Param("pbr2"))
	d.decodeTwoValues(2, br2, peekBits, table, buffer, out, exhausted, offP)
	br3 := Dereference(Param("pbr3"))
	d.decodeTwoValues(3, br3, peekBits, table, buffer, out, exhausted, offP)

	ADDB(U8(2), offP) // off += 2

	TESTB(exhausted.As8H(), exhausted.As8H()) // any br[i].off < 4?
	JNZ(LabelRef("done"))

	// off is a single byte, so it can never hold 256: after 128 iterations
	// the ADDB above wraps it to 0. Compare only that byte (the return slot
	// is one byte wide) and keep looping until the wrap happens.
	CMPB(offP, U8(0))
	JNZ(LabelRef("main_loop"))

	Label("done")
	RET()
}
// decodeTwoValues emits the code that decodes two symbols from stream id and
// writes them, as one 16-bit store, into that stream's region of the output
// buffer.
//
// Parameters:
//   - id: stream index; used in labels and comments and to select the
//     stream's region of the buffer (displacement id*buffoff).
//   - br: the dereferenced bit reader of the stream; its bitsRead, off, value
//     and in fields are loaded, updated and stored back.
//   - peekBits, table, buffer: registers preloaded by the caller with the
//     peek shift count, the decoding table base and the output buffer base.
//   - out: physical register whose 8L/8H bytes collect the two symbols.
//   - exhausted: physical register; its 8H byte accumulates the
//     "off < 4 after refill" flag, its 8L byte is used as scratch.
//   - offP: memory location of the current byte offset into the buffer.
//
// RCX and RDX are used as fixed scratch registers and are clobbered.
func (d decompress4x) decodeTwoValues(id int, br gotypes.Component, peekBits, table, buffer reg.GPVirtual, out, exhausted reg.GPPhysical, offP Mem) {
	Commentf("br%d.fillFast()", id)
	brOffset := GP64()
	brBitsRead := GP64()
	brValue := GP64()

	Load(br.Field("bitsRead"), brBitsRead)
	Load(br.Field("off"), brOffset)
	Load(br.Field("value"), brValue)

	// We must have at least 2 * max tablelog left
	CMPQ(brBitsRead, U8(64-22))
	JBE(LabelRef("skip_fill" + strconv.Itoa(id)))

	SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
	SUBQ(U8(4), brOffset)    // b.off -= 4

	// v := b.in[b.off-4 : b.off]
	// v = v[:4]
	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
	tmp := GP64()
	Load(br.Field("in").Base(), tmp.As64())

	Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
	CX := reg.CL
	addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}

	// Always load exactly four bytes (zero-extended to 64 bits) before
	// shifting. Shifting straight from memory with SHLXQ would read eight
	// bytes and OR the extra four into bits of value that are still live.
	MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4])
	if d.bmi2 {
		SHLXQ(brBitsRead, tmp.As64(), tmp.As64()) // tmp <<= (b.bitsRead & 63)
	} else {
		MOVQ(brBitsRead, CX.As64())
		SHLQ(CX, tmp.As64()) // tmp <<= (b.bitsRead & 63)
	}

	ORQ(tmp.As64(), brValue)

	Commentf("exhausted = exhausted || (br%d.off < 4)", id)
	CMPQ(brOffset, U8(4))
	SETLT(exhausted.As8L())
	ORB(exhausted.As8L(), exhausted.As8H())

	Label("skip_fill" + strconv.Itoa(id))

	tmp = GP64()
	Commentf("val0 := br%d.peekTopBits(peekBits)", id)
	if d.bmi2 {
		SHRXQ(peekBits, brValue, tmp.As64()) // tmp = value >> peekBits
	} else {
		MOVQ(brValue, tmp.As64())
		MOVQ(peekBits, CX.As64())
		SHRQ(CX, tmp.As64()) // tmp = value >> peekBits
	}

	Comment("v0 := table[val0&mask]")
	tmp8 := reg.RDX
	// Index with the value just computed in tmp; RDX holds nothing useful
	// yet at this point.
	MOVW(Mem{Base: table, Index: tmp.As64(), Scale: 2}, tmp8.As16()) // DX = v0

	Commentf("br%d.advance(uint8(v0.entry))", id)
	MOVB(tmp8.As8H(), out.As8()) // out.8L = uint8(v0.entry >> 8)
	MOVBQZX(tmp8.As8(), CX.As64())
	if d.bmi2 {
		SHLXQ(tmp8.As64(), brValue, brValue) // value <<= n
	} else {
		SHLQ(CX, brValue) // value <<= n
	}

	ADDQ(CX.As64(), brBitsRead) // bits_read += n

	Commentf("val1 := br%d.peekTopBits(peekBits)", id)
	if d.bmi2 {
		SHRXQ(peekBits, brValue, tmp8.As64()) // RDX = value >> peekBits
	} else {
		MOVQ(peekBits, CX.As64())
		MOVQ(brValue, tmp8.As64())
		SHRQ(CX, tmp8.As64()) // RDX = value >> peekBits
	}

	Comment("v1 := table[val1&mask]")
	MOVW(Mem{Base: table, Index: tmp8.As64(), Scale: 2}, tmp8.As16()) // DX = v1

	Commentf("br%d.advance(uint8(v1.entry))", id)
	MOVB(tmp8.As8H(), out.As8H()) // out.8H = uint8(v1.entry >> 8)
	MOVBQZX(tmp8.As8(), CX.As64())
	if d.bmi2 {
		SHLXQ(tmp8.As64(), brValue, brValue) // value <<= n
	} else {
		SHLQ(CX, brValue) // value <<= n
	}

	ADDQ(CX.As64(), brBitsRead) // bits_read += n

	Comment("these two writes get coalesced")
	Comment("buf[stream][off] = uint8(v0.entry >> 8)")
	Comment("buf[stream][off+1] = uint8(v1.entry >> 8)")
	off := GP64()
	MOVBQZX(offP, off)
	MOVW(out.As16(), Mem{Base: buffer, Index: off, Scale: 1, Disp: id * buffoff})

	Comment("update the bitreader structure")
	Store(brBitsRead.As8(), br.Field("bitsRead"))
	Store(brValue, br.Field("value"))
	// The offset goes back into "off"; writing it to "value" would clobber
	// the value stored just above and leave off unchanged.
	Store(brOffset, br.Field("off"))
}
func IfDef(def string) {
Commentf("#ifdef %s", def)
}
// Else adds the comment "#else" to the code being generated.
func Else() {
	const directive = "#else"
	Comment(directive)
}
// EndIf adds the comment "#endif" to the code being generated.
func EndIf() {
	const directive = "#endif"
	Comment(directive)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment