Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
benchmark another way to format mysql date and datetime into a `[]byte`
// Go MySQL Driver - A MySQL-Driver for Go's database/sql package
//
// Copyright 2013 The Go-MySQL-Driver Authors. All rights reserved.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at http://mozilla.org/MPL/2.0/.
package mysql
import (
"database/sql/driver"
"encoding/binary"
"fmt"
"testing"
)
// dtNew formats a MySQL binary-protocol DATE/DATETIME packet as text
// ("YYYY-MM-DD", optionally followed by " HH:MM:SS" and up to six fractional
// digits) without going through fmt.
//
// src is the raw packet payload: empty (zero value), 4 bytes (date),
// 7 bytes (date and time) or 11 bytes (date, time and microseconds).
// length is the exact length of the text to produce: 10 for a date, 19 for a
// datetime, or 21-26 for a datetime with 1-6 fractional digits. Parts that
// the packet does not carry are filled with zeros.
//
// It returns a []byte wrapped in driver.Value. An error is returned for an
// unsupported length or packet size.
//
// Notes:
//   - For an empty packet the result is a sub-slice of the shared
//     zeroDateTime value, so callers must not modify it.
//   - NOTE(review): zeroDateTime is declared outside this file; it is
//     presumably "0000-00-00 00:00:00.000000" (26 bytes) - confirm.
//   - The digit tables hold 100 entries; a malformed packet with a component
//     above 99 (e.g. year > 9999) indexes past them and panics.
func dtNew(src []byte, length uint8) (driver.Value, error) {
	// Lookup tables: digits10[n] / digits01[n] are the tens / ones digit of
	// n for 0 <= n <= 99, so a two-digit number costs two table reads.
	const digits01 = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"
	const digits10 = "0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999"

	// Validate the requested output length before anything is sliced with
	// it, so an unsupported length is reported as an error even when the
	// packet is empty instead of slicing zeroDateTime with it.
	switch length {
	case 10, 19, 21, 22, 23, 24, 25, 26:
	default:
		t := "DATE"
		if length > 10 {
			t += "TIME"
		}
		return nil, fmt.Errorf("illegal %s length %d", t, length)
	}

	// An empty packet is the zero value; no allocation needed.
	if len(src) == 0 {
		return zeroDateTime[:length], nil
	}

	switch len(src) {
	case 4, 7, 11:
	default:
		t := "DATE"
		if length > 10 {
			t += "TIME"
		}
		return nil, fmt.Errorf("illegal %s-packet length %d", t, len(src))
	}

	var p0, p1, p2, p3 byte // current digit pairs
	dst := make([]byte, 0, length)

	// Date: little-endian uint16 year, then month and day as single bytes.
	year := binary.LittleEndian.Uint16(src[:2])
	p0 = byte(year / 100)
	p1 = byte(year - 100*uint16(p0)) // year % 100 without a second division
	p2, p3 = src[2], src[3]
	dst = append(dst,
		digits10[p0], digits01[p0], digits10[p1], digits01[p1], '-',
		digits10[p2], digits01[p2], '-',
		digits10[p3], digits01[p3],
	)
	if length == 10 {
		return dst, nil
	}
	if len(src) == 4 {
		// Date-only packet but a datetime was requested: zero-fill the rest.
		return append(dst, zeroDateTime[10:length]...), nil
	}

	// Time: hour, minute and second as single bytes.
	p1 = src[4]
	src = src[5:]
	p2, p3 = src[0], src[1]
	dst = append(dst, ' ',
		digits10[p1], digits01[p1], ':',
		digits10[p2], digits01[p2], ':',
		digits10[p3], digits01[p3],
	)
	if length <= byte(len(dst)) {
		return dst, nil
	}
	src = src[2:]
	if len(src) == 0 {
		// No microseconds in the packet: zero-fill the fraction.
		return append(dst, zeroDateTime[19:length]...), nil
	}

	// Microseconds are a little-endian uint32; only the low three bytes are
	// read here, the fourth byte is ignored.
	microsecs := uint32(src[0]) | uint32(src[1])<<8 | uint32(src[2])<<16
	p1 = byte(microsecs / 10000)
	microsecs -= 10000 * uint32(p1)
	p2 = byte(microsecs / 100)
	microsecs -= 100 * uint32(p2)
	p3 = byte(microsecs)

	// length - 20 is the number of fractional digits requested (1-6); the
	// default branch emits all six.
	switch decimals := length - 20; decimals {
	default:
		return append(dst, '.',
			digits10[p1], digits01[p1],
			digits10[p2], digits01[p2],
			digits10[p3], digits01[p3],
		), nil
	case 1:
		return append(dst, '.',
			digits10[p1],
		), nil
	case 2:
		return append(dst, '.',
			digits10[p1], digits01[p1],
		), nil
	case 3:
		return append(dst, '.',
			digits10[p1], digits01[p1],
			digits10[p2],
		), nil
	case 4:
		return append(dst, '.',
			digits10[p1], digits01[p1],
			digits10[p2], digits01[p2],
		), nil
	case 5:
		return append(dst, '.',
			digits10[p1], digits01[p1],
			digits10[p2], digits01[p2],
			digits10[p3],
		), nil
	}
}
// d_Old is the original utils.go formatBinaryDate, kept as the baseline the
// new formatter is benchmarked against.
//
// num is the packet length and data the packet payload. A length of 0 yields
// the zero date, a length of 4 is decoded as a little-endian uint16 year
// followed by one byte each for month and day. Any other length is an error.
// The text is returned as a []byte.
func d_Old(num int, data []byte) (driver.Value, error) {
	if num == 0 {
		return []byte("0000-00-00"), nil
	}
	if num != 4 {
		return nil, fmt.Errorf("Invalid DATE-packet length %d", num)
	}

	year := binary.LittleEndian.Uint16(data[:2])
	month, day := data[2], data[3]
	text := fmt.Sprintf("%04d-%02d-%02d", year, month, day)
	return []byte(text), nil
}
// dtOld is the original utils.go formatBinaryDateTime, kept as the baseline
// the new formatter is benchmarked against.
//
// num is the packet length and data the packet payload:
//   - 0:  the zero datetime
//   - 4:  date only, the time part is written as 00:00:00
//   - 7:  date and time
//   - 11: date, time and a little-endian uint32 microsecond count
//
// Any other length is an error. The text is returned as a []byte.
func dtOld(num int, data []byte) (driver.Value, error) {
	if num == 0 {
		return []byte("0000-00-00 00:00:00"), nil
	}
	if num != 4 && num != 7 && num != 11 {
		return nil, fmt.Errorf("Invalid DATETIME-packet length %d", num)
	}

	// Every non-empty packet starts with the date.
	year := binary.LittleEndian.Uint16(data[:2])
	month, day := data[2], data[3]
	if num == 4 {
		text := fmt.Sprintf("%04d-%02d-%02d 00:00:00", year, month, day)
		return []byte(text), nil
	}

	hour, minute, second := data[4], data[5], data[6]
	if num == 7 {
		text := fmt.Sprintf(
			"%04d-%02d-%02d %02d:%02d:%02d",
			year, month, day, hour, minute, second,
		)
		return []byte(text), nil
	}

	// num == 11: the packet also carries microseconds.
	micros := binary.LittleEndian.Uint32(data[7:11])
	text := fmt.Sprintf(
		"%04d-%02d-%02d %02d:%02d:%02d.%06d",
		year, month, day, hour, minute, second, micros,
	)
	return []byte(text), nil
}
// benchNewDT measures the new formatter (dtNew) on a datetime packet.
// src is the raw packet and outlen the text length requested from dtNew.
// Allocation statistics are reported alongside the timing.
func benchNewDT(b *testing.B, src []byte, outlen uint8) {
	// Keep the setup out of the measured time.
	b.StopTimer()
	b.ReportAllocs()
	b.StartTimer()

	for remaining := b.N; remaining > 0; remaining-- {
		// Only the cost of the call is of interest; the result is discarded.
		_, _ = dtNew(src, outlen)
	}
}
// benchNewD_ measures the new formatter (dtNew) on a date-only packet.
// src is the raw packet and outlen the text length requested from dtNew.
// Allocation statistics are reported alongside the timing.
func benchNewD_(b *testing.B, src []byte, outlen uint8) {
	// Keep the setup out of the measured time.
	b.StopTimer()
	b.ReportAllocs()
	b.StartTimer()

	for remaining := b.N; remaining > 0; remaining-- {
		// Only the cost of the call is of interest; the result is discarded.
		_, _ = dtNew(src, outlen)
	}
}
// benchOldDT measures the original datetime formatter (dtOld) on src.
// outlen is unused; it only keeps the signature parallel to the benchmarks
// of the new formatter. Allocation statistics are reported with the timing.
func benchOldDT(b *testing.B, src []byte, outlen uint8) {
	// Keep the setup out of the measured time.
	b.StopTimer()
	packetLen := len(src) // dtOld takes the packet length explicitly
	b.ReportAllocs()
	b.StartTimer()

	for remaining := b.N; remaining > 0; remaining-- {
		// Only the cost of the call is of interest; the result is discarded.
		_, _ = dtOld(packetLen, src)
	}
}
func benchOldD_(b *testing.B, src []byte, outlen uint8) {
b.StopTimer()
num := len(src)
b.ReportAllocs()
b.StartTimer()
for i := 0; i < b.N; i++ {
// old method, date only
_, _ = dtOld(num, src)
}
}
// rawDate is the benchmark fixture: a binary DATETIME packet for
// 2012-10-13 15:34:59.987654. The benchmarks slice it to 0, 4, 7 or 11 bytes
// to obtain the different packet sizes.
//
// Multi-byte fields are little endian because the formatters decode them
// with binary.LittleEndian, so the low byte of the year comes first.
var rawDate = []byte{
	2012 % 256, 2012 / 256, // year (little-endian uint16)
	10,           // month
	13,           // day
	15,           // hour
	34,           // minute
	59,           // second
	6, 18, 15, 0, // microsecond (987654, little-endian uint32)
}
// The benchmarks below come in New/Old pairs. The name suffix encodes the
// requested format (D_ = date, DT = datetime) and the number of packet bytes
// taken from rawDate (00, 04, 07 or 11). The last argument is the requested
// text length.

// BenchmarkFormatNewD_00 formats an empty packet as a date (new formatter).
func BenchmarkFormatNewD_00(b *testing.B) {
	benchNewD_(b, rawDate[:0], 10)
}

// BenchmarkFormatOldD_00 formats an empty packet as a date (old formatter).
func BenchmarkFormatOldD_00(b *testing.B) {
	benchOldD_(b, rawDate[:0], 10)
}

// BenchmarkFormatNewD_04 formats a 4-byte packet as a date (new formatter).
func BenchmarkFormatNewD_04(b *testing.B) {
	benchNewD_(b, rawDate[:4], 10)
}

// BenchmarkFormatOldD_04 formats a 4-byte packet as a date (old formatter).
func BenchmarkFormatOldD_04(b *testing.B) {
	benchOldD_(b, rawDate[:4], 10)
}

// BenchmarkFormatNewDT00 formats an empty packet as a datetime (new formatter).
func BenchmarkFormatNewDT00(b *testing.B) {
	benchNewDT(b, rawDate[:0], 19)
}

// BenchmarkFormatOldDT00 formats an empty packet as a datetime (old formatter).
func BenchmarkFormatOldDT00(b *testing.B) {
	benchOldDT(b, rawDate[:0], 19)
}

// BenchmarkFormatNewDT04 formats a 4-byte packet as a datetime (new formatter).
func BenchmarkFormatNewDT04(b *testing.B) {
	benchNewDT(b, rawDate[:4], 19)
}

// BenchmarkFormatOldDT04 formats a 4-byte packet as a datetime (old formatter).
func BenchmarkFormatOldDT04(b *testing.B) {
	benchOldDT(b, rawDate[:4], 19)
}

// BenchmarkFormatNewDT07 formats a 7-byte packet as a datetime (new formatter).
func BenchmarkFormatNewDT07(b *testing.B) {
	benchNewDT(b, rawDate[:7], 19)
}

// BenchmarkFormatOldDT07 formats a 7-byte packet as a datetime (old formatter).
func BenchmarkFormatOldDT07(b *testing.B) {
	benchOldDT(b, rawDate[:7], 19)
}

// BenchmarkFormatNewDT11 formats an 11-byte packet as a datetime with six
// fractional digits (new formatter).
func BenchmarkFormatNewDT11(b *testing.B) {
	benchNewDT(b, rawDate[:11], 26)
}

// BenchmarkFormatOldDT11 formats an 11-byte packet as a datetime with six
// fractional digits (old formatter).
func BenchmarkFormatOldDT11(b *testing.B) {
	benchOldDT(b, rawDate[:11], 26)
}
@arnehormann
Copy link
Author

arnehormann commented Dec 12, 2013

Results on my system:

$ go test -run=- -bench=BenchmarkFormat*
PASS
BenchmarkFormatNewD_00  20000000           128 ns/op          48 B/op          2 allocs/op
BenchmarkFormatOldD_00  20000000           140 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewD_04  10000000           161 ns/op          48 B/op          2 allocs/op
BenchmarkFormatOldD_04   2000000           901 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT00  20000000           138 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT00  20000000           137 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewDT04  10000000           171 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT04   2000000           890 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT07  10000000           188 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT07   1000000          1336 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT11  10000000           216 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT11   1000000          1624 ns/op          96 B/op          3 allocs/op

@arnehormann
Copy link
Author

arnehormann commented Dec 12, 2013

Also tried this with `if` instead of `switch`, a different order (the current one makes error handling easier) and fewer local vars - no visible improvement. The version below shows all the changes I tried, mashed into one.

// dtNew2 formats a MySQL binary-protocol DATE/DATETIME packet as text by
// copying a template of zeros and adding each decimal digit onto its '0'.
//
// src is the raw packet: empty (zero value), 4 bytes (date), and - only when
// withTime is set - 7 bytes (date and time) or 11 bytes (date, time and
// microseconds). withTime selects DATETIME output ("YYYY-MM-DD HH:MM:SS",
// plus ".ffffff" for an 11-byte packet) instead of DATE output
// ("YYYY-MM-DD").
//
// It returns a freshly allocated []byte, or an error for any other packet
// length. The length is checked before anything is indexed, so a truncated
// or oversized packet, or a time-carrying packet passed with withTime unset,
// is reported as an error rather than indexing out of range.
func dtNew2(src []byte, withTime bool) (driver.Value, error) {
    const zeroDateTimeMicros = "0000-00-00 00:00:00.000000"
    srclen := len(src)

    // Reject unsupported packet lengths up front.
    switch {
    case srclen == 0, srclen == 4:
    case withTime && (srclen == 7 || srclen == 11):
    default:
        mode := "DATE"
        if withTime {
            mode = "DATETIME"
        }
        return nil, fmt.Errorf("invalid %s-packet length %d", mode, srclen)
    }

    // Pick the template length: date, datetime, or datetime with micros.
    outlen := 10
    if withTime {
        outlen = 19
        if srclen == 11 {
            outlen = 26
        }
    }
    dst := []byte(zeroDateTimeMicros[:outlen])
    if srclen == 0 {
        return dst, nil
    }

    // The template holds '0' at every digit position, so adding the digit
    // value yields the right character. The "% 10" keeps every addend in
    // 0-9 even if a component is out of its normal range.
    var tmp uint
    tmp = uint(binary.LittleEndian.Uint16(src[:2])) // year
    dst[0] += byte((tmp / 1000) % 10)
    dst[1] += byte((tmp / 100) % 10)
    dst[2] += byte((tmp / 10) % 10)
    dst[3] += byte(tmp % 10)
    tmp = uint(src[2]) // month
    dst[5] += byte((tmp / 10) % 10)
    dst[6] += byte(tmp % 10)
    tmp = uint(src[3]) // day
    dst[8] += byte((tmp / 10) % 10)
    dst[9] += byte(tmp % 10)
    if srclen == 4 {
        return dst, nil
    }

    tmp = uint(src[4]) // hour
    dst[11] += byte((tmp / 10) % 10)
    dst[12] += byte(tmp % 10)
    tmp = uint(src[5]) // minute
    dst[14] += byte((tmp / 10) % 10)
    dst[15] += byte(tmp % 10)
    tmp = uint(src[6]) // second
    dst[17] += byte((tmp / 10) % 10)
    dst[18] += byte(tmp % 10)
    if srclen == 7 {
        return dst, nil
    }

    // srclen == 11: little-endian uint32 microseconds.
    tmp = uint(binary.LittleEndian.Uint32(src[7:11]))
    dst[20] += byte((tmp / 100000) % 10)
    dst[21] += byte((tmp / 10000) % 10)
    dst[22] += byte((tmp / 1000) % 10)
    dst[23] += byte((tmp / 100) % 10)
    dst[24] += byte((tmp / 10) % 10)
    dst[25] += byte(tmp % 10)
    return dst, nil
}

@arnehormann
Copy link
Author

arnehormann commented Dec 13, 2013

no difference for

    if withTime {
        if srclen == 11 {
            dst = []byte{
                '0', '0', '0', '0', '-', '0', '0', '-', '0', '0',
                ' ', '0', '0', ':', '0', '0', ':', '0', '0',
                '.', '0', '0', '0', '0', '0', '0',
            }
        } else {
            dst = []byte{
                '0', '0', '0', '0', '-', '0', '0', '-', '0', '0',
                ' ', '0', '0', ':', '0', '0', ':', '0', '0',
            }
        }
    } else {
        dst = []byte{
            '0', '0', '0', '0', '-', '0', '0', '-', '0', '0',
        }
    }

... how do I get rid of one more allocation?!?

@julienschmidt
Copy link

julienschmidt commented Dec 13, 2013

You could try to reuse the src slice as dst.
Implementing it with if instead of switch should actually be a few nanoseconds slower, since the compiler can use a jump table for switch :-P

@arnehormann
Copy link
Author

arnehormann commented Dec 13, 2013

I thought that's only when the values are more or less continuous - and, for switch, in ascending order.
Well, currently go tool 6g -S FILE.go (Go version from yesterday) is pretty much silent and doesn't yield anything, no output at all. And go tool 6l -a FILE.6 is much too verbose. So I tried but couldn't check. Something's borked in my installation.

Reusing source: but it's not long enough? Should I just overwrite "earlier" parts of the buffer?

@julienschmidt
Copy link

julienschmidt commented Dec 13, 2013

ASM with 6g from Go 1.2: http://pastebin.com/F9ny3nuf

@julienschmidt
Copy link

julienschmidt commented Dec 13, 2013

I removed the 'testing' import, so line numbers are -1 each ;)

@arnehormann
Copy link
Author

arnehormann commented Dec 13, 2013

@julienschmidt thanks for the assembly.
There is an uglier / faster version (consistently ~10% for BenchmarkFormatNewDT11 - down to about 190ns):
Get rid of modulo to cut down duplicate operations in assembly.

Taking e.g. month, replace

dst[5] += (month / 10) % 10
dst[6] += month % 10

with

tmp = month / 10
dst[6] += month - 10*tmp
dst[5] += tmp

Fast but ugly. I don't know if it's worth it - but as the code probably won't be touched again for some time and this gist can always be referenced in a comment...

@arnehormann
Copy link
Author

arnehormann commented Dec 13, 2013

I changed the version in the gist to use the "improvement" from my latest comment.
We can always jump back with the history...

@julienschmidt
Copy link

julienschmidt commented Dec 13, 2013

a := []byte{
    '0', '0', '0', '0', '-', '0', '0', '-', '0', '0',
    ' ', '0', '0', ':', '0', '0', ':', '0', '0',
    '.', '0', '0', '0', '0', '0', '0',
}

b := []byte("0000-00-00 00:00:00.000000")

a and b compile to the same assembly btw.

@arnehormann
Copy link
Author

arnehormann commented Dec 14, 2013

@julienschmidt if you are ok with it, please delete your asm comment - it doesn't reflect the current version and I got what I wanted out of it, but it makes the site slower, the discussion longer and Chrome says the site is in Albanian and offers a translation (?!?).

Are you ok with the code as it is here or do you want something changed in the PR?
aaaand I know this is a kinda ridiculous overengineering of an already fast function not called overly much. Still, fun 😀

@julienschmidt
Copy link

julienschmidt commented Dec 14, 2013

Moved the ASM to pastebin.

Here is a new snippet generated by pprof --disasm, showing the reason for the additional allocation:

     . 3238.5    84: return dst, nil
     .      .      43731a: MOVQ $4d3460,0(SP)
     .      .      437322: MOVQ 88(SP),BX
     .      .      43732a: MOVQ BX,8(SP)
     .      .      43732f: MOVQ CX,10(SP)
     .      .      437334: MOVQ DI,18(SP)
     . 3238.5      437339: CALL runtime.convT2E(SB)
     .      .      43733e: MOVQ 20(SP),BX
     .      .      437343: MOVQ BX,e8(SP)
     .      .      43734b: MOVQ 28(SP),BX
     .      .      437350: MOVQ BX,f0(SP)
     .      .      437358: MOVQ $0,f8(SP)
     .      .      437364: MOVQ $0,100(SP)
     .      .      437370: ADDQ $c0,SP
     .      .      437377:    RET

@julienschmidt
Copy link

julienschmidt commented Dec 14, 2013

As said earlier, the len=0 case could still be optimized. Something like:

var zeroDateTime []byte = []byte("0000-00-00 00:00:00")

func dtNew(src []byte, withTime bool) (driver.Value, error) {
    if len(src) == 0 {
        if withTime {
            return zeroDateTime, nil
        }
        return zeroDateTime[:10], nil
    }
    ...

@arnehormann
Copy link
Author

arnehormann commented Jun 4, 2014

I did a rewrite to include another approach with appending digit tuples.
This also enables a new api which lets you specify the output length independent of the input length.
No significant slowdown:

BenchmarkFormatNewD_00  50000000            69.8 ns/op        32 B/op          1 allocs/op
BenchmarkFormatNew2D_00 50000000            69.9 ns/op        32 B/op          1 allocs/op
BenchmarkFormatOldD_00  20000000           132 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewD_04  10000000           191 ns/op          48 B/op          2 allocs/op
BenchmarkFormatNew2D_04 10000000           204 ns/op          48 B/op          2 allocs/op
BenchmarkFormatOldD_04   2000000           833 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT00  50000000            69.6 ns/op        32 B/op          1 allocs/op
BenchmarkFormatNew2DT00 50000000            70.3 ns/op        32 B/op          1 allocs/op
BenchmarkFormatOldDT00  20000000           131 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewDT04  10000000           198 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNew2DT04 10000000           222 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT04   2000000           833 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT07  10000000           209 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNew2DT07 10000000           250 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT07   1000000          1272 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT11  10000000           229 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNew2DT11 10000000           289 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT11   1000000          1496 ns/op          96 B/op          3 allocs/op

@arnehormann
Copy link
Author

arnehormann commented Jun 5, 2014

And with the current version (2014-06-05), I beat the old one and gain flexible output length handling.

BenchmarkFormatNewD_00    50000000      69.4 ns/op    32 B/op     1 allocs/op
BenchmarkFormatNew2D_00   50000000      69.4 ns/op    32 B/op     1 allocs/op
BenchmarkFormatOldD_00    20000000     130.0 ns/op    64 B/op     2 allocs/op

BenchmarkFormatNewD_04    10000000     189.0 ns/op    48 B/op     2 allocs/op
BenchmarkFormatNew2D_04   10000000     179.0 ns/op    48 B/op     2 allocs/op
BenchmarkFormatOldD_04     2000000     823.0 ns/op    96 B/op     3 allocs/op

BenchmarkFormatNewDT00    50000000      69.3 ns/op    32 B/op     1 allocs/op
BenchmarkFormatNew2DT00   50000000      69.2 ns/op    32 B/op     1 allocs/op
BenchmarkFormatOldDT00    20000000     129.0 ns/op    64 B/op     2 allocs/op

BenchmarkFormatNewDT04    10000000     200.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatNew2DT04   10000000     194.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatOldDT04     2000000     824.0 ns/op    96 B/op     3 allocs/op

BenchmarkFormatNewDT07    10000000     211.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatNew2DT07   10000000     205.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatOldDT07     1000000    1256.0 ns/op    96 B/op     3 allocs/op

BenchmarkFormatNewDT11    10000000     231.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatNew2DT11   10000000     225.0 ns/op    64 B/op     2 allocs/op
BenchmarkFormatOldDT11     1000000    1489.0 ns/op    96 B/op     3 allocs/op

@arnehormann
Copy link
Author

arnehormann commented Jun 7, 2014

improved ...New2 with inspiration from strconv and replaced ...New with it.

BenchmarkFormatNewD_00  50000000            68.6 ns/op        32 B/op          1 allocs/op
BenchmarkFormatOldD_00  20000000           129 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewD_04  10000000           175 ns/op          48 B/op          2 allocs/op
BenchmarkFormatOldD_04   2000000           848 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT00  50000000            68.2 ns/op        32 B/op          1 allocs/op
BenchmarkFormatOldDT00  20000000           129 ns/op          64 B/op          2 allocs/op
BenchmarkFormatNewDT04  10000000           193 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT04   2000000           844 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT07  10000000           201 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT07   1000000          1294 ns/op          96 B/op          3 allocs/op
BenchmarkFormatNewDT11  10000000           221 ns/op          64 B/op          2 allocs/op
BenchmarkFormatOldDT11   1000000          1498 ns/op          96 B/op          3 allocs/op

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment