pirgeo/exponential_layout.go

## exponential_layout.go
// Copyright 2021 Dynatrace LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package exponential

import (
	"math"
	"math/bits"
)

type Layout interface {
	MapToIndex(float64) int
}

type exponentialLayout struct {
	// there are 2^precision buckets for the mantissa to map to.
	precision int
  // An array of indices (lookup table). Mantissas of floats are mapped to this array
	// of equidistant buckets, which points to a bucket in the boundaries array
	// that is no further away than two buckets from the correct target bucket.
	indices []int
	// the array of boundaries, represented as the mantissas of the double values in bits.
	boundaries []uint64
	// integer representation of the smallest value that should be treated normally
	// before that every double value has its own bucket.
	// This is the first bit where two doubles can fall into one bucket
	firstNormalValueBits uint64
	// The offset which determines how many indices are skipped for subnormal floats
	indexOffset int
}

func NewExponentialLayout(precision int) *exponentialLayout {
	bounds := calculateBoundaries(precision)
	indices := calculateIndices(bounds, precision)
	fnvb, io := getIndexOffset(precision, indices, bounds)

	return &exponentialLayout{
		precision:            precision,
		indices:              indices,
		boundaries:           bounds,
		firstNormalValueBits: fnvb,
		indexOffset:          io,
	}
}

// calculate boundaries for the mantissa part only.
// This calculates the bucket boundaries independent of the exponent.
// This layout is the same for all mantissas, independent of the exponents.
// It depends only on the desired number of buckets subdividing the [1, 2] range covered by the mantissa.
// The number of buckets is 2^precision
func calculateBoundaries(precision int) []uint64 {
	length := 1 << precision

	boundaries := make([]uint64, length+1)

	for i := 0; i < len(boundaries)-1; i++ {
		boundaries[i] = 0x000fffffffffffff & math.Float64bits(math.Pow(2, float64(i+1)/float64(length)))
	}

	boundaries[length-1] = 0x0010000000000000
	boundaries[length] = 0x0010000000000000

	return boundaries
}

// Create the array which is roughly mapping into the boundaries array
func calculateIndices(boundaries []uint64, precision int) []int {
	length := 1 << precision
	indices := make([]int, length)
	c := 0
	for i := 0; i < length; i++ {
    // e.g. for precision = 2, this evaluates to:
		// i=0: 1.0; i=1: 1.25; i=2: 1.5; i=3: 1.75
		mantissaLowerBound := uint64(i) << (52 - precision)
		// find the lowest boundary that is smaller than or equal to the equidistant bucket bound
		for boundaries[c] <= mantissaLowerBound {
			c++
		}
		indices[i] = c
	}
	return indices
}

// It is possible that in the subnormal range, there are a lot of buckets that cannot have any contents
// because the IEEE double specification cannot be used to represent values in that range. In order to not create
// a large number of extremely small buckets that can never contain any values, this function calculates the point where
// two doubles are mapped to the same index.
// This point is stored and used as an offset later on
func getIndexOffset(precision int, indices []int, boundaries []uint64) (uint64, int) {
	// helper layout to determine the index offset and the first normal value bits
	layout := exponentialLayout{
		precision:            precision,
		indices:              indices,
		boundaries:           boundaries,
		indexOffset:          0,
		firstNormalValueBits: 0,
	}

	var valueBits uint64 = 0
	var index int = math.MinInt64
	for {
		nextValueBits := valueBits + 1
		nextIndex := layout.mapToBinIndexHelper(nextValueBits)
		if index == nextIndex {
			break
		}
		valueBits = nextValueBits
		index = nextIndex
	}
	return valueBits, int(valueBits) - index
}

// This is the code that actually maps the double value to the correct bin.
func (el exponentialLayout) mapToBinIndexHelper(valueBits uint64) int {
	// The last 52 bits (bits 0 through 51) of a double are the mantissa.
	// Get these from the valueBits, which is a bit representation of the double value
	mantissa := 0xfffffffffffff & valueBits

	// The bits 52 through 63 are the exponent.
	// extract the exponent from the bit representation of the double value.
	// Then shift it over by 52 bits (the length of the mantissa) and convert it to an integer
	// This also removes the sign bit.
	exponent := int((0x7ff0000000000000 & valueBits) >> 52)

	// This is for the special case of the subnormal doubles.
	if exponent == 0 {
		if mantissa < el.firstNormalValueBits {
			return int(mantissa)
		}
		// calculate the number of leading zeros for the mantissa but ignore the first 12 bits (exponent and sign)
		nlz := bits.LeadingZeros64(mantissa) - 12

		// move it to the double normal form in order to use it like a non-subnormal double in the bucket calculation below.
		exponent -= nlz
		mantissa <<= (nlz + 1)
		mantissa &= 0x000fffffffffffff
	}

	// find the "rough" bucket index from the indices array
	// The indices array has evenly spaced buckets and contains indices into the boundaries array
	// The line below transforms the normalized mantissa into an index into the indices array
	// Then, the index into the boundaries array is retrieved.
	i := el.indices[int(mantissa>>(52-el.precision))]

	// The index in the boundaries array might not be correct right away, there might be an offset of a maximum of two buckets.
	// Therefore, look at three buckets: The one specified by the index, and the next two.
	// in other languages, this can be done in one line with a ternary operator
	// e.g. return (exponent << el.precision) + i + (mantissa >= el.boundaries[i] ? 1 : 0) + (mantissa >= el.boundaries[i+1] ? 1 : 0) + el.indexOffset
	offset := 0
	if mantissa >= el.boundaries[i] {
		offset++
	}
	if mantissa >= el.boundaries[i+1] {
		offset++
	}
	k := i + offset

	// the indexOffset is only used to skip subnormal buckets
	// the exponent is used to find the correct "top-level" bucket,
	// and k is used to find the correct index therein.
	return (exponent << el.precision) + k + el.indexOffset
}

func (el exponentialLayout) MapToBinIndex(value float64) int {
	valueBits := math.Float64bits(value)

	index := el.mapToBinIndexHelper(valueBits)

	// if the sign bit is set, the value is negative and the index should be negative.
	if (valueBits & 0x8000000000000000) == 0x8000000000000000 {
		return -index
	}
	return index
}

## main.go
// Copyright 2021 Dynatrace LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"fmt"
	"math"

	"github.com/dynatrace-oss/dynahist-go-poc/exponential"
)

func main() {
	// Print bucket bounds.
	// there are 2^precision buckets in the mantissa range between 1 and 2
	precision := 3
	length := 1 << precision
	fmt.Println("------------------------")
	fmt.Println("precision:", precision)
	fmt.Println("number of buckets:", length)
	fmt.Println("------------------------")
	for i := -1; i < length; i++ {
		fmt.Printf("2^(%d/%d)=%f\n", i+1, length, math.Pow(2, float64(i+1)/float64(length)))
	}

	fmt.Println("index mapping:")
	layout := exponential.NewExponentialLayout(3)
	fmt.Printf("% 4.5f\t%d\n", 0.0, layout.MapToBinIndex(0))
	fmt.Printf("% 4.5f\t%d\n", 0.0001, layout.MapToBinIndex(0.0001))
	fmt.Printf("% 4.5f\t%d\n", 0.01, layout.MapToBinIndex(0.01))
	fmt.Printf("% 4.5f\t%d\n", 0.1, layout.MapToBinIndex(0.1))
	fmt.Printf("% 4.5f\t%d\n", 1.0, layout.MapToBinIndex(1))
	fmt.Printf("% 4.5f\t%d\n", 2.0, layout.MapToBinIndex(2))
	fmt.Printf("% 4.5f\t%d\n", 3.0, layout.MapToBinIndex(3))
	fmt.Printf("% 4.1f\t\t%d\n", 10.0, layout.MapToBinIndex(10))
	fmt.Printf("% 4.1f\t\t%d\n", 100.0, layout.MapToBinIndex(100))
	fmt.Printf("% 4.1f\t\t%d\n", 1000.0, layout.MapToBinIndex(1000))
	fmt.Printf("%s\t%d\n", "math.SmallestNonzeroFloat64", layout.MapToBinIndex(math.SmallestNonzeroFloat64))
	fmt.Printf("%s\t\t\t%d\n", "math.MaxFloat64", layout.MapToBinIndex(math.MaxFloat64))

	// ------------------------
	// precision: 3
	// number of buckets: 8
	// ------------------------
	// 2^(0/8)=1.000000
	// 2^(1/8)=1.090508
	// 2^(2/8)=1.189207
	// 2^(3/8)=1.296840
	// 2^(4/8)=1.414214
	// 2^(5/8)=1.542211
	// 2^(6/8)=1.681793
	// 2^(7/8)=1.834008
	// 2^(8/8)=2.000000
	//
	// index mapping:
	//  0.00000        0
	//  0.00010        8469
	//  0.01000        8522
	//  0.10000        8549
	//  1.00000        8576
	//  2.00000        8584
	//  3.00000        8588
	//  10.0           8602
	//  100.0          8629
	//  1000.0         8655
	// math.SmallestNonzeroFloat64     1
	// math.MaxFloat64                 16767
}
	// Copyright 2021 Dynatrace LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package exponential

	import (
	"math"
	"math/bits"
	)

	type Layout interface {
	MapToIndex(float64) int
	}

	type exponentialLayout struct {
	// there are 2^precision buckets for the mantissa to map to.
	precision int
	// An array of indices (lookup table). Mantissas of floats are mapped to this array
	// of equidistant buckets, which points to a bucket in the boundaries array
	// that is no further away than two buckets from the correct target bucket.
	indices []int
	// the array of boundaries, represented as the mantissas of the double values in bits.
	boundaries []uint64
	// integer representation of the smallest value that should be treated normally
	// before that every double value has its own bucket.
	// This is the first bit where two doubles can fall into one bucket
	firstNormalValueBits uint64
	// The offset which determines how many indices are skipped for subnormal floats
	indexOffset int
	}

	func NewExponentialLayout(precision int) *exponentialLayout {
	bounds := calculateBoundaries(precision)
	indices := calculateIndices(bounds, precision)
	fnvb, io := getIndexOffset(precision, indices, bounds)

	return &exponentialLayout{
	precision: precision,
	indices: indices,
	boundaries: bounds,
	firstNormalValueBits: fnvb,
	indexOffset: io,
	}
	}

	// calculate boundaries for the mantissa part only.
	// This calculates the bucket boundaries independent of the exponent.
	// This layout is the same for all mantissas, independent of the exponents.
	// It depends only on the desired number of buckets subdividing the [1, 2] range covered by the mantissa.
	// The number of buckets is 2^precision
	func calculateBoundaries(precision int) []uint64 {
	length := 1 << precision

	boundaries := make([]uint64, length+1)

	for i := 0; i < len(boundaries)-1; i++ {
	boundaries[i] = 0x000fffffffffffff & math.Float64bits(math.Pow(2, float64(i+1)/float64(length)))
	}

	boundaries[length-1] = 0x0010000000000000
	boundaries[length] = 0x0010000000000000

	return boundaries
	}

	// Create the array which is roughly mapping into the boundaries array
	func calculateIndices(boundaries []uint64, precision int) []int {
	length := 1 << precision
	indices := make([]int, length)
	c := 0
	for i := 0; i < length; i++ {
	// e.g. for precision = 2, this evaluates to:
	// i=0: 1.0; i=1: 1.25; i=2: 1.5; i=3: 1.75
	mantissaLowerBound := uint64(i) << (52 - precision)
	// find the lowest boundary that is smaller than or equal to the equidistant bucket bound
	for boundaries[c] <= mantissaLowerBound {
	c++
	}
	indices[i] = c
	}
	return indices
	}

	// It is possible that in the subnormal range, there are a lot of buckets that cannot have any contents
	// because the IEEE double specification cannot be used to represent values in that range. In order to not create
	// a large number of extremely small buckets that can never contain any values, this function calculates the point where
	// two doubles are mapped to the same index.
	// This point is stored and used as an offset later on
	func getIndexOffset(precision int, indices []int, boundaries []uint64) (uint64, int) {
	// helper layout to determine the index offset and the first normal value bits
	layout := exponentialLayout{
	precision: precision,
	indices: indices,
	boundaries: boundaries,
	indexOffset: 0,
	firstNormalValueBits: 0,
	}

	var valueBits uint64 = 0
	var index int = math.MinInt64
	for {
	nextValueBits := valueBits + 1
	nextIndex := layout.mapToBinIndexHelper(nextValueBits)
	if index == nextIndex {
	break
	}
	valueBits = nextValueBits
	index = nextIndex
	}
	return valueBits, int(valueBits) - index
	}

	// This is the code that actually maps the double value to the correct bin.
	func (el exponentialLayout) mapToBinIndexHelper(valueBits uint64) int {
	// The last 52 bits (bits 0 through 51) of a double are the mantissa.
	// Get these from the valueBits, which is a bit representation of the double value
	mantissa := 0xfffffffffffff & valueBits

	// The bits 52 through 63 are the exponent.
	// extract the exponent from the bit representation of the double value.
	// Then shift it over by 52 bits (the length of the mantissa) and convert it to an integer
	// This also removes the sign bit.
	exponent := int((0x7ff0000000000000 & valueBits) >> 52)

	// This is for the special case of the subnormal doubles.
	if exponent == 0 {
	if mantissa < el.firstNormalValueBits {
	return int(mantissa)
	}
	// calculate the number of leading zeros for the mantissa but ignore the first 12 bits (exponent and sign)
	nlz := bits.LeadingZeros64(mantissa) - 12

	// move it to the double normal form in order to use it like a non-subnormal double in the bucket calculation below.
	exponent -= nlz
	mantissa <<= (nlz + 1)
	mantissa &= 0x000fffffffffffff
	}

	// find the "rough" bucket index from the indices array
	// The indices array has evenly spaced buckets and contains indices into the boundaries array
	// The line below transforms the normalized mantissa into an index into the indices array
	// Then, the index into the boundaries array is retrieved.
	i := el.indices[int(mantissa>>(52-el.precision))]

	// The index in the boundaries array might not be correct right away, there might be an offset of a maximum of two buckets.
	// Therefore, look at three buckets: The one specified by the index, and the next two.
	// in other languages, this can be done in one line with a ternary operator
	// e.g. return (exponent << el.precision) + i + (mantissa >= el.boundaries[i] ? 1 : 0) + (mantissa >= el.boundaries[i+1] ? 1 : 0) + el.indexOffset
	offset := 0
	if mantissa >= el.boundaries[i] {
	offset++
	}
	if mantissa >= el.boundaries[i+1] {
	offset++
	}
	k := i + offset

	// the indexOffset is only used to skip subnormal buckets
	// the exponent is used to find the correct "top-level" bucket,
	// and k is used to find the correct index therein.
	return (exponent << el.precision) + k + el.indexOffset
	}

	func (el exponentialLayout) MapToBinIndex(value float64) int {
	valueBits := math.Float64bits(value)

	index := el.mapToBinIndexHelper(valueBits)

	// if the sign bit is set, the value is negative and the index should be negative.
	if (valueBits & 0x8000000000000000) == 0x8000000000000000 {
	return -index
	}
	return index
	}