Skip to content

Instantly share code, notes, and snippets.

@pirgeo
Created July 13, 2021 15:34
Show Gist options
  • Save pirgeo/9f713e486a1b1e8e479dc163456b835d to your computer and use it in GitHub Desktop.
Save pirgeo/9f713e486a1b1e8e479dc163456b835d to your computer and use it in GitHub Desktop.
Mapping to histogram buckets in an exponential layout
// Copyright 2021 Dynatrace LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package exponential
import (
"math"
"math/bits"
)
type Layout interface {
MapToIndex(float64) int
}
type exponentialLayout struct {
// there are 2^precision buckets for the mantissa to map to.
precision int
// An array of indices (lookup table). Mantissas of floats are mapped to this array
// of equidistant buckets, which points to a bucket in the boundaries array
// that is no further away than two buckets from the correct target bucket.
indices []int
// the array of boundaries, represented as the mantissas of the double values in bits.
boundaries []uint64
// integer representation of the smallest value that should be treated normally
// before that every double value has its own bucket.
// This is the first bit where two doubles can fall into one bucket
firstNormalValueBits uint64
// The offset which determines how many indices are skipped for subnormal floats
indexOffset int
}
func NewExponentialLayout(precision int) *exponentialLayout {
bounds := calculateBoundaries(precision)
indices := calculateIndices(bounds, precision)
fnvb, io := getIndexOffset(precision, indices, bounds)
return &exponentialLayout{
precision: precision,
indices: indices,
boundaries: bounds,
firstNormalValueBits: fnvb,
indexOffset: io,
}
}
// calculate boundaries for the mantissa part only.
// This calculates the bucket boundaries independent of the exponent.
// This layout is the same for all mantissas, independent of the exponents.
// It depends only on the desired number of buckets subdividing the [1, 2] range covered by the mantissa.
// The number of buckets is 2^precision
func calculateBoundaries(precision int) []uint64 {
length := 1 << precision
boundaries := make([]uint64, length+1)
for i := 0; i < len(boundaries)-1; i++ {
boundaries[i] = 0x000fffffffffffff & math.Float64bits(math.Pow(2, float64(i+1)/float64(length)))
}
boundaries[length-1] = 0x0010000000000000
boundaries[length] = 0x0010000000000000
return boundaries
}
// Create the array which is roughly mapping into the boundaries array
func calculateIndices(boundaries []uint64, precision int) []int {
length := 1 << precision
indices := make([]int, length)
c := 0
for i := 0; i < length; i++ {
// e.g. for precision = 2, this evaluates to:
// i=0: 1.0; i=1: 1.25; i=2: 1.5; i=3: 1.75
mantissaLowerBound := uint64(i) << (52 - precision)
// find the lowest boundary that is smaller than or equal to the equidistant bucket bound
for boundaries[c] <= mantissaLowerBound {
c++
}
indices[i] = c
}
return indices
}
// It is possible that in the subnormal range, there are a lot of buckets that cannot have any contents
// because the IEEE double specification cannot be used to represent values in that range. In order to not create
// a large number of extremely small buckets that can never contain any values, this function calculates the point where
// two doubles are mapped to the same index.
// This point is stored and used as an offset later on
func getIndexOffset(precision int, indices []int, boundaries []uint64) (uint64, int) {
// helper layout to determine the index offset and the first normal value bits
layout := exponentialLayout{
precision: precision,
indices: indices,
boundaries: boundaries,
indexOffset: 0,
firstNormalValueBits: 0,
}
var valueBits uint64 = 0
var index int = math.MinInt64
for {
nextValueBits := valueBits + 1
nextIndex := layout.mapToBinIndexHelper(nextValueBits)
if index == nextIndex {
break
}
valueBits = nextValueBits
index = nextIndex
}
return valueBits, int(valueBits) - index
}
// This is the code that actually maps the double value to the correct bin.
func (el exponentialLayout) mapToBinIndexHelper(valueBits uint64) int {
// The last 52 bits (bits 0 through 51) of a double are the mantissa.
// Get these from the valueBits, which is a bit representation of the double value
mantissa := 0xfffffffffffff & valueBits
// The bits 52 through 63 are the exponent.
// extract the exponent from the bit representation of the double value.
// Then shift it over by 52 bits (the length of the mantissa) and convert it to an integer
// This also removes the sign bit.
exponent := int((0x7ff0000000000000 & valueBits) >> 52)
// This is for the special case of the subnormal doubles.
if exponent == 0 {
if mantissa < el.firstNormalValueBits {
return int(mantissa)
}
// calculate the number of leading zeros for the mantissa but ignore the first 12 bits (exponent and sign)
nlz := bits.LeadingZeros64(mantissa) - 12
// move it to the double normal form in order to use it like a non-subnormal double in the bucket calculation below.
exponent -= nlz
mantissa <<= (nlz + 1)
mantissa &= 0x000fffffffffffff
}
// find the "rough" bucket index from the indices array
// The indices array has evenly spaced buckets and contains indices into the boundaries array
// The line below transforms the normalized mantissa into an index into the indices array
// Then, the index into the boundaries array is retrieved.
i := el.indices[int(mantissa>>(52-el.precision))]
// The index in the boundaries array might not be correct right away, there might be an offset of a maximum of two buckets.
// Therefore, look at three buckets: The one specified by the index, and the next two.
// in other languages, this can be done in one line with a ternary operator
// e.g. return (exponent << el.precision) + i + (mantissa >= el.boundaries[i] ? 1 : 0) + (mantissa >= el.boundaries[i+1] ? 1 : 0) + el.indexOffset
offset := 0
if mantissa >= el.boundaries[i] {
offset++
}
if mantissa >= el.boundaries[i+1] {
offset++
}
k := i + offset
// the indexOffset is only used to skip subnormal buckets
// the exponent is used to find the correct "top-level" bucket,
// and k is used to find the correct index therein.
return (exponent << el.precision) + k + el.indexOffset
}
func (el exponentialLayout) MapToBinIndex(value float64) int {
valueBits := math.Float64bits(value)
index := el.mapToBinIndexHelper(valueBits)
// if the sign bit is set, the value is negative and the index should be negative.
if (valueBits & 0x8000000000000000) == 0x8000000000000000 {
return -index
}
return index
}
// Copyright 2021 Dynatrace LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"fmt"
"math"
"github.com/dynatrace-oss/dynahist-go-poc/exponential"
)
func main() {
// Print bucket bounds.
// there are 2^precision buckets in the mantissa range between 1 and 2
precision := 3
length := 1 << precision
fmt.Println("------------------------")
fmt.Println("precision:", precision)
fmt.Println("number of buckets:", length)
fmt.Println("------------------------")
for i := -1; i < length; i++ {
fmt.Printf("2^(%d/%d)=%f\n", i+1, length, math.Pow(2, float64(i+1)/float64(length)))
}
fmt.Println("index mapping:")
layout := exponential.NewExponentialLayout(3)
fmt.Printf("% 4.5f\t%d\n", 0.0, layout.MapToBinIndex(0))
fmt.Printf("% 4.5f\t%d\n", 0.0001, layout.MapToBinIndex(0.0001))
fmt.Printf("% 4.5f\t%d\n", 0.01, layout.MapToBinIndex(0.01))
fmt.Printf("% 4.5f\t%d\n", 0.1, layout.MapToBinIndex(0.1))
fmt.Printf("% 4.5f\t%d\n", 1.0, layout.MapToBinIndex(1))
fmt.Printf("% 4.5f\t%d\n", 2.0, layout.MapToBinIndex(2))
fmt.Printf("% 4.5f\t%d\n", 3.0, layout.MapToBinIndex(3))
fmt.Printf("% 4.1f\t\t%d\n", 10.0, layout.MapToBinIndex(10))
fmt.Printf("% 4.1f\t\t%d\n", 100.0, layout.MapToBinIndex(100))
fmt.Printf("% 4.1f\t\t%d\n", 1000.0, layout.MapToBinIndex(1000))
fmt.Printf("%s\t%d\n", "math.SmallestNonzeroFloat64", layout.MapToBinIndex(math.SmallestNonzeroFloat64))
fmt.Printf("%s\t\t\t%d\n", "math.MaxFloat64", layout.MapToBinIndex(math.MaxFloat64))
// ------------------------
// precision: 3
// number of buckets: 8
// ------------------------
// 2^(0/8)=1.000000
// 2^(1/8)=1.090508
// 2^(2/8)=1.189207
// 2^(3/8)=1.296840
// 2^(4/8)=1.414214
// 2^(5/8)=1.542211
// 2^(6/8)=1.681793
// 2^(7/8)=1.834008
// 2^(8/8)=2.000000
//
// index mapping:
// 0.00000 0
// 0.00010 8469
// 0.01000 8522
// 0.10000 8549
// 1.00000 8576
// 2.00000 8584
// 3.00000 8588
// 10.0 8602
// 100.0 8629
// 1000.0 8655
// math.SmallestNonzeroFloat64 1
// math.MaxFloat64 16767
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment