@legends2k
Created September 9, 2015 07:53
Prints the distribution of an 8-bit minifloat 1.4.3.−7
#include <iostream>
#include <bitset>
#include <cstdint>
#include <cstring>  // memcpy
#include <iomanip>
#include <limits>
#include <sstream>
#include <cmath>
/*
* 8-bit Minifloat mimicking binary32 of IEEE 754/IEC 60559
* Prints the distribution of minifloat 1.4.3.−7 (sign.exponent.mantissa.bias)
* on the real number line, skipping negatives as the distribution is symmetric.
*
* Exponent interval: [0, 15], bias = −7
* Special exponents: 0 – Denormals, 15 – Inf/NaN
* Normalized exponents = [1, 14] ⇒ Normalized unbiased exponents = [−6, 7]
* For denormals, the implicit bit = 0 (hence the name sub/de-normals) and the
* effective exponent is that of the smallest normal, min exp = −6 for this
* minifloat.
* minifloat: S EEEE MMM
* binary32: S EEEEEEEE MMMMMMMMMMMMMMMMMMMMMMM
*
*
* When devising an exponential notation number system with, say, four digits
* for the base number and two digits for the exponent, one could represent
* numbers from 9.999e⁹⁹ to −9.999e⁹⁹ and as small as 1e⁻⁹⁹ or, if you cheat,
* 0.001e⁻⁹⁹. You can now represent both much larger numbers and much smaller;
* the tradeoff is that the absolute resolution is no longer constant, and gets
* smaller as the absolute value of the numbers gets larger. The number 123.456
* can only be represented as 123.4, and the number 123456 can only be
* represented as 123400. You can't represent 999.999; you have to settle for
* 999.9 (9.999e²) or 1000 (1.000e³). You can't distinguish between 999.998 and
* 999.999 any more[1].
*
* Precision can be absolute or relative. Integer types have an absolute
* precision of 1. Every integer within the type's range is represented. Fixed
* point types also have an absolute precision. A fixed point number with scale
* of 1⁄1000 can represent numbers from 999.999 to −999.999, and as small as
* 0.001. It'd have a resolution of 0.001 everywhere: it could represent 0.001
* and 0.002, as well as 999.998 and 999.999. Floating point formats use
* relative precision. This means that the precision is constant relative to the
* size of the number. For example, 1.3331, 1.3331e+5 = 133310, and 1.3331e⁻³ =
* 0.0013331 all have 5 decimal digits of relative precision[2].
*
* Floating-point number representation trades range for precision. Precision is
* the number of most significant non-zero digits used in representing a number;
* the radix point doesn't play any role in this[3]. Given this definition, how
* do we agree that the trade-off is between range and precision? Doesn't the
* number of digits remain constant as the radix point floats around? It does,
* but the set of representable numbers isn't constant. Take a number close to
* 0, where the resolution is finest (near 2⁰, not near the end of the range,
* where it's coarsest, i.e. near 2^max_exp): 0.1234567, with 7 digits of
* precision, may be represented comfortably, while 123456.7, with the same 7
* digits of precision, may not be representable in this system. The
* significand 1234567 itself stays the same, but the exponent field may not be
* large enough to move the radix point that far, so the number falls outside
* the representable set. Bruce Dawson's article[4] treats the precision of FP
* in great detail.
*
* Resolution is calculated as 2^(k−n), where n is the number of mantissa
* bits excluding the implicit bit and k is the unbiased exponent. As n is
* fixed for a given FP type, resolution varies with k; all numbers within
* [2^k, 2^(k+1)) (a binade) have the same resolution. The rounding error of
* each floating-point operation is at most half of this resolution[5][6].
* This quantity is also called the ULP[7], the weight of the last bit of the
* significand[8]; the acronym, for "unit in the last place", was coined by
* William Kahan in 1960. His original definition: ulp(x) is the gap between
* the two floating-point numbers nearest to x, even if x is one of them. ULP
* is also given by ε × 2^k, where ε is the machine epsilon of a given type:
* the difference between 1 and the next representable number in that system,
* ε = 2⁻ⁿ. For this minifloat, ε = 1.001₂ − 1 = 0.001₂ = ⅛ = 2⁻³[9].
*
* [1]: http://www.eskimo.com/~scs/cclass/progintro/sx5.html
* [2]: http://www.csharphelp.com/2007/07/floating-point-in-c-part-i-concepts-and-formats/
* [3]: http://stackoverflow.com/q/13103304
* [4]: http://randomascii.wordpress.com/2012/03/08/float-precisionfrom-zero-to-100-digits-2/
* [5]: http://stackoverflow.com/a/30537881
* [6]: http://stackoverflow.com/a/30538124
* [7]: http://stackoverflow.com/questions/30537330/is-floating-point-precision-mutable-or-invariant/30537881#comment52415637_30537881
* [8]: §2.6.1, Handbook of Floating-Point Arithmetic
* [9]: Chapter 3, Numerical Computing with IEEE Floating-Point Arithmetic
*
* SEE ALSO:
* blog.reverberate.org/2014/09/what-every-computer-programmer-should.html
* stackoverflow.com/q/872544
*/
static_assert(std::numeric_limits<float>::is_iec559,
              "float is not IEEE 754's binary32!");
#if defined(_MSC_VER) && (_MSC_VER <= 1800)
# define constexpr const
#endif
auto constexpr fw = 15; // width of 0.001953125

struct field {
  int width;
  explicit field(int width) : width(width) { }
};

std::ostream& operator<<(std::ostream &os, field f) {
  return os << std::left << std::setw(f.width);
}
void print(unsigned e, unsigned m) {
  // enable this to get the minifloat's bit pattern
  // std::bitset<8> const b = m | (e << 3u);
  bool const denormal = (!e && m);
  // inf, denormal, regular and zero cases handled in that order
  // −7 to get the actual exponent and +127 to make binary32 biased exponent
  // if it's a minifloat inf/nan make it a binary32 inf/nan
  e = (e || m) ? ((e >= 0xf) ? 0xff : ((denormal ? 1 : e) - 7 + 127)) : 0u;
  uint32_t i = (m << 20u) | (e << 23u);
  float f;
  memcpy(&f, &i, sizeof(i));
  // what is denormal for minifloat isn't denormal for binary32; maintain 2^−6
  // as exponent but subtract 1.0 * 2^−6 to turn implicit 1 into 0
  f -= denormal ? 0.015625f : 0.0f;
  std::cout << field(fw) << f;
}
int main() {
  std::cout << "Minifloat 1.4.3.−7\n";
  std::cout << std::setprecision(std::numeric_limits<float>::max_digits10);
  std::cout << field(fw) << "E v \\ M >";
  for (auto i = 0; i < 8; ++i) {
    std::ostringstream ss("0eeee", std::ios::ate);
    ss << std::bitset<3>(i);
    auto const s = ss.str();
    std::cout << field(fw) << s;
  }
  std::cout << field(fw) << "|RESOLUTION" << '\n';
  std::cout << field(fw) << "00000mmm*";
  // the set of numbers in a binary IEEE 754 floating-point format that all
  // have the same exponent is a binade; it is the interval [2ⁿ, 2ⁿ⁺¹) for
  // some value of n. Print 3 sets of binades: subnormals, normals, inf/nans
  unsigned e = 0u;
  for (unsigned m = 0u; m < 8; ++m)
    print(e, m);
  float k = -6.0f; // minimum normal exponent
  std::cout << '|' << field(fw) << std::exp2(k - 3.0f);
  for (e = 1; e < 15; ++e) {
    std::ostringstream ss("0", std::ios::ate);
    ss << std::bitset<4>(e) << "mmm";
    auto const s = ss.str();
    std::cout << '\n' << field(fw) << s;
    for (unsigned m = 0u; m < 8; ++m)
      print(e, m);
    std::cout << '|' << field(fw) << std::exp2(k++ - 3.0f);
  }
  std::cout << '\n' << field(fw) << "01111mmm";
  for (unsigned m = 0u; m < 8; ++m)
    print(e, m);
  std::cout << field(fw) << "|N/A";
  std::cout << "\n* denormals\n\n";
  // for a pictorial representation of a similar minifloat's distribution see
  // http://cs.smith.edu/dftwiki/index.php/CSC231_An_Introduction_to_Fixed-_and_Floating-Point_Numbers#The_gap_between_floats_is_not_constant.2C_as_with_Fixed-Point
  std::cout << "Max representable integer = max representable value = "
               "1.111 × 2^m = (2^(k+1) − 1) × 2^(m − k) = 2^(m + 1) − 2^(m − k)"
               "= 2⁸ − 2⁴ = 240\nwhere maximum normal exponent, m = 7;"
               " significand bits excluding hidden, k = 3\n";
  // §1.5.2, Essential Math, 2nd Edition
  // http://stackoverflow.com/a/1848762
  std::cout << "Maximum continuously representable integer from 0 = "
               "2^(k + 1) = 2⁴ = 16\n";
  // this is similar to the range calculation of integral data types; n bits
  // can represent 2ⁿ states; here n = k + 1 significand bits. However there
  // is a catch: integral types cannot represent the number 2ⁿ while floats
  // can: http://stackoverflow.com/a/23031245
}
Output:

Minifloat 1.4.3.−7
E v \ M >      0eeee000       0eeee001       0eeee010       0eeee011       0eeee100       0eeee101       0eeee110       0eeee111       |RESOLUTION
00000mmm*      0              0.001953125    0.00390625     0.005859375    0.0078125      0.009765625    0.01171875     0.013671875    |0.001953125
00001mmm       0.015625       0.017578125    0.01953125     0.021484375    0.0234375      0.025390625    0.02734375     0.029296875    |0.001953125
00010mmm       0.03125        0.03515625     0.0390625      0.04296875     0.046875       0.05078125     0.0546875      0.05859375     |0.00390625
00011mmm       0.0625         0.0703125      0.078125       0.0859375      0.09375        0.1015625      0.109375       0.1171875      |0.0078125
00100mmm       0.125          0.140625       0.15625        0.171875       0.1875         0.203125       0.21875        0.234375       |0.015625 
00101mmm       0.25           0.28125        0.3125         0.34375        0.375          0.40625        0.4375         0.46875        |0.03125  
00110mmm       0.5            0.5625         0.625          0.6875         0.75           0.8125         0.875          0.9375         |0.0625   
00111mmm       1              1.125          1.25           1.375          1.5            1.625          1.75           1.875          |0.125    
01000mmm       2              2.25           2.5            2.75           3              3.25           3.5            3.75           |0.25     
01001mmm       4              4.5            5              5.5            6              6.5            7              7.5            |0.5      
01010mmm       8              9              10             11             12             13             14             15             |1        
01011mmm       16             18             20             22             24             26             28             30             |2        
01100mmm       32             36             40             44             48             52             56             60             |4        
01101mmm       64             72             80             88             96             104            112            120            |8        
01110mmm       128            144            160            176            192            208            224            240            |16       
01111mmm       inf            nan            nan            nan            nan            nan            nan            nan            |N/A      
* denormals

Max representable integer = max representable value = 1.111 × 2^m = (2^(k+1) − 1) × 2^(m − k) = 2^(m + 1) − 2^(m − k)= 2⁸ − 2⁴ = 240
where maximum normal exponent, m = 7; significand bits excluding hidden, k = 3
Maximum continuously representable integer from 0 = 2^(k + 1) = 2⁴ = 16
