Created
July 13, 2021 15:34
-
-
Save pirgeo/9f713e486a1b1e8e479dc163456b835d to your computer and use it in GitHub Desktop.
Mapping to histogram buckets in an exponential layout
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2021 Dynatrace LLC | |
// | |
// Licensed under the Apache License, Version 2.0 (the "License"); | |
// you may not use this file except in compliance with the License. | |
// You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
package exponential | |
import ( | |
"math" | |
"math/bits" | |
) | |
type Layout interface { | |
MapToIndex(float64) int | |
} | |
type exponentialLayout struct { | |
// there are 2^precision buckets for the mantissa to map to. | |
precision int | |
// An array of indices (lookup table). Mantissas of floats are mapped to this array | |
// of equidistant buckets, which points to a bucket in the boundaries array | |
// that is no further away than two buckets from the correct target bucket. | |
indices []int | |
// the array of boundaries, represented as the mantissas of the double values in bits. | |
boundaries []uint64 | |
// integer representation of the smallest value that should be treated normally | |
// before that every double value has its own bucket. | |
// This is the first bit where two doubles can fall into one bucket | |
firstNormalValueBits uint64 | |
// The offset which determines how many indices are skipped for subnormal floats | |
indexOffset int | |
} | |
func NewExponentialLayout(precision int) *exponentialLayout { | |
bounds := calculateBoundaries(precision) | |
indices := calculateIndices(bounds, precision) | |
fnvb, io := getIndexOffset(precision, indices, bounds) | |
return &exponentialLayout{ | |
precision: precision, | |
indices: indices, | |
boundaries: bounds, | |
firstNormalValueBits: fnvb, | |
indexOffset: io, | |
} | |
} | |
// calculate boundaries for the mantissa part only. | |
// This calculates the bucket boundaries independent of the exponent. | |
// This layout is the same for all mantissas, independent of the exponents. | |
// It depends only on the desired number of buckets subdividing the [1, 2] range covered by the mantissa. | |
// The number of buckets is 2^precision | |
func calculateBoundaries(precision int) []uint64 { | |
length := 1 << precision | |
boundaries := make([]uint64, length+1) | |
for i := 0; i < len(boundaries)-1; i++ { | |
boundaries[i] = 0x000fffffffffffff & math.Float64bits(math.Pow(2, float64(i+1)/float64(length))) | |
} | |
boundaries[length-1] = 0x0010000000000000 | |
boundaries[length] = 0x0010000000000000 | |
return boundaries | |
} | |
// Create the array which is roughly mapping into the boundaries array | |
func calculateIndices(boundaries []uint64, precision int) []int { | |
length := 1 << precision | |
indices := make([]int, length) | |
c := 0 | |
for i := 0; i < length; i++ { | |
// e.g. for precision = 2, this evaluates to: | |
// i=0: 1.0; i=1: 1.25; i=2: 1.5; i=3: 1.75 | |
mantissaLowerBound := uint64(i) << (52 - precision) | |
// find the lowest boundary that is smaller than or equal to the equidistant bucket bound | |
for boundaries[c] <= mantissaLowerBound { | |
c++ | |
} | |
indices[i] = c | |
} | |
return indices | |
} | |
// It is possible that in the subnormal range, there are a lot of buckets that cannot have any contents | |
// because the IEEE double specification cannot be used to represent values in that range. In order to not create | |
// a large number of extremely small buckets that can never contain any values, this function calculates the point where | |
// two doubles are mapped to the same index. | |
// This point is stored and used as an offset later on | |
func getIndexOffset(precision int, indices []int, boundaries []uint64) (uint64, int) { | |
// helper layout to determine the index offset and the first normal value bits | |
layout := exponentialLayout{ | |
precision: precision, | |
indices: indices, | |
boundaries: boundaries, | |
indexOffset: 0, | |
firstNormalValueBits: 0, | |
} | |
var valueBits uint64 = 0 | |
var index int = math.MinInt64 | |
for { | |
nextValueBits := valueBits + 1 | |
nextIndex := layout.mapToBinIndexHelper(nextValueBits) | |
if index == nextIndex { | |
break | |
} | |
valueBits = nextValueBits | |
index = nextIndex | |
} | |
return valueBits, int(valueBits) - index | |
} | |
// This is the code that actually maps the double value to the correct bin. | |
func (el exponentialLayout) mapToBinIndexHelper(valueBits uint64) int { | |
// The last 52 bits (bits 0 through 51) of a double are the mantissa. | |
// Get these from the valueBits, which is a bit representation of the double value | |
mantissa := 0xfffffffffffff & valueBits | |
// The bits 52 through 63 are the exponent. | |
// extract the exponent from the bit representation of the double value. | |
// Then shift it over by 52 bits (the length of the mantissa) and convert it to an integer | |
// This also removes the sign bit. | |
exponent := int((0x7ff0000000000000 & valueBits) >> 52) | |
// This is for the special case of the subnormal doubles. | |
if exponent == 0 { | |
if mantissa < el.firstNormalValueBits { | |
return int(mantissa) | |
} | |
// calculate the number of leading zeros for the mantissa but ignore the first 12 bits (exponent and sign) | |
nlz := bits.LeadingZeros64(mantissa) - 12 | |
// move it to the double normal form in order to use it like a non-subnormal double in the bucket calculation below. | |
exponent -= nlz | |
mantissa <<= (nlz + 1) | |
mantissa &= 0x000fffffffffffff | |
} | |
// find the "rough" bucket index from the indices array | |
// The indices array has evenly spaced buckets and contains indices into the boundaries array | |
// The line below transforms the normalized mantissa into an index into the indices array | |
// Then, the index into the boundaries array is retrieved. | |
i := el.indices[int(mantissa>>(52-el.precision))] | |
// The index in the boundaries array might not be correct right away, there might be an offset of a maximum of two buckets. | |
// Therefore, look at three buckets: The one specified by the index, and the next two. | |
// in other languages, this can be done in one line with a ternary operator | |
// e.g. return (exponent << el.precision) + i + (mantissa >= el.boundaries[i] ? 1 : 0) + (mantissa >= el.boundaries[i+1] ? 1 : 0) + el.indexOffset | |
offset := 0 | |
if mantissa >= el.boundaries[i] { | |
offset++ | |
} | |
if mantissa >= el.boundaries[i+1] { | |
offset++ | |
} | |
k := i + offset | |
// the indexOffset is only used to skip subnormal buckets | |
// the exponent is used to find the correct "top-level" bucket, | |
// and k is used to find the correct index therein. | |
return (exponent << el.precision) + k + el.indexOffset | |
} | |
func (el exponentialLayout) MapToBinIndex(value float64) int { | |
valueBits := math.Float64bits(value) | |
index := el.mapToBinIndexHelper(valueBits) | |
// if the sign bit is set, the value is negative and the index should be negative. | |
if (valueBits & 0x8000000000000000) == 0x8000000000000000 { | |
return -index | |
} | |
return index | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2021 Dynatrace LLC | |
// | |
// Licensed under the Apache License, Version 2.0 (the "License"); | |
// you may not use this file except in compliance with the License. | |
// You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
package main | |
import ( | |
"fmt" | |
"math" | |
"github.com/dynatrace-oss/dynahist-go-poc/exponential" | |
) | |
func main() { | |
// Print bucket bounds. | |
// there are 2^precision buckets in the mantissa range between 1 and 2 | |
precision := 3 | |
length := 1 << precision | |
fmt.Println("------------------------") | |
fmt.Println("precision:", precision) | |
fmt.Println("number of buckets:", length) | |
fmt.Println("------------------------") | |
for i := -1; i < length; i++ { | |
fmt.Printf("2^(%d/%d)=%f\n", i+1, length, math.Pow(2, float64(i+1)/float64(length))) | |
} | |
fmt.Println("index mapping:") | |
layout := exponential.NewExponentialLayout(3) | |
fmt.Printf("% 4.5f\t%d\n", 0.0, layout.MapToBinIndex(0)) | |
fmt.Printf("% 4.5f\t%d\n", 0.0001, layout.MapToBinIndex(0.0001)) | |
fmt.Printf("% 4.5f\t%d\n", 0.01, layout.MapToBinIndex(0.01)) | |
fmt.Printf("% 4.5f\t%d\n", 0.1, layout.MapToBinIndex(0.1)) | |
fmt.Printf("% 4.5f\t%d\n", 1.0, layout.MapToBinIndex(1)) | |
fmt.Printf("% 4.5f\t%d\n", 2.0, layout.MapToBinIndex(2)) | |
fmt.Printf("% 4.5f\t%d\n", 3.0, layout.MapToBinIndex(3)) | |
fmt.Printf("% 4.1f\t\t%d\n", 10.0, layout.MapToBinIndex(10)) | |
fmt.Printf("% 4.1f\t\t%d\n", 100.0, layout.MapToBinIndex(100)) | |
fmt.Printf("% 4.1f\t\t%d\n", 1000.0, layout.MapToBinIndex(1000)) | |
fmt.Printf("%s\t%d\n", "math.SmallestNonzeroFloat64", layout.MapToBinIndex(math.SmallestNonzeroFloat64)) | |
fmt.Printf("%s\t\t\t%d\n", "math.MaxFloat64", layout.MapToBinIndex(math.MaxFloat64)) | |
// ------------------------ | |
// precision: 3 | |
// number of buckets: 8 | |
// ------------------------ | |
// 2^(0/8)=1.000000 | |
// 2^(1/8)=1.090508 | |
// 2^(2/8)=1.189207 | |
// 2^(3/8)=1.296840 | |
// 2^(4/8)=1.414214 | |
// 2^(5/8)=1.542211 | |
// 2^(6/8)=1.681793 | |
// 2^(7/8)=1.834008 | |
// 2^(8/8)=2.000000 | |
// | |
// index mapping: | |
// 0.00000 0 | |
// 0.00010 8469 | |
// 0.01000 8522 | |
// 0.10000 8549 | |
// 1.00000 8576 | |
// 2.00000 8584 | |
// 3.00000 8588 | |
// 10.0 8602 | |
// 100.0 8629 | |
// 1000.0 8655 | |
// math.SmallestNonzeroFloat64 1 | |
// math.MaxFloat64 16767 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment