Skip to content

Instantly share code, notes, and snippets.

@rLinks234
Last active May 15, 2024 03:52
Show Gist options
  • Save rLinks234/fa45b1cf1045f20a4606ca5a1e51254d to your computer and use it in GitHub Desktop.
Save rLinks234/fa45b1cf1045f20a4606ca5a1e51254d to your computer and use it in GitHub Desktop.
A Simple but very efficient single precision float to string implementation
///
/// Author: Daniel Wright
/// Influenced by: https://github.com/vinceallenvince/floatToString/blob/master/floatToString.h
///
/// Simple demo involving a custom float to string implementation
/// Should be portable, but cannot process IEEE-754 double precision numbers
/// ^^ this can only handle single (float32) floating point numbers.
///
/// Decently accurate compared to sprintf, but I don't think it is going to be
/// nearly as rigorous.
///
/// Appears to be about ~5 times faster than glibc's sprintf("%f", number)
/// implementation, but that is at least on the machine tested. The performance
/// advantage seems to diminish slightly for numbers
/// with a very large magnitude (> (+/-)10^12)
///
/// compiled with following command:
/// gcc -O3 -march=native -malign-data=cache -fomit-frame-pointer floatToString.cpp -o floatToString -lstdc++
///
/// used gcc version 6.1.1 on Arch Linux (Linux 4.7.0 (Intel x64))
///
#include <stdint.h>
#include <stdio.h>
#include <cstdlib>
#include <ctime>
#include <cmath>
/// Random huge number to test how much divergence from
/// glibc's sprintf the implementation will acheieve
#define HI 1.6212959e+17
#define LO -HI
/// 250 thousand iterations, takes about a few seconds.
/// Covers a good amount of ground computationally for
/// just a simple benchmark.
#define ITERATIONS_PERF 250000
/// Used in function below to print side-by-side outputs
/// from both libc's sprintf and the custom implementation
#define ITERATIONS_LOGICAL 33
/// Used for char buffer
#define BUFFERSIZE 4096
/// Change to refine precision
#define NUM_DECIMAL_PLACES 15
/// ^
#define SPRINTF_STR "%15.15f"
/// Our ***very*** constrained digit-to-char function
static inline void to_digit_char_unsafe(int8_t digit, char* out) {
out[0] = '0' + digit;
}
/// lossy floating point values probably not a big deal.
/// Enabled it since some platforms (e.g. ARM) can't use
/// some SIMD instructions (i.e. NEON) without relaxed fp.
/// Should only affect infinities, but I'm not entirely sure.
#pragma FP_CONTRACT fast
#define __FAST_MATH__ 1
/// Instead of calculating powers of 10 (and their reciprocals),
/// it is computationally quicker to have tables of these values
/// to which the code can look. Only goes up to 10^38 (and 10^-38)
/// since that is the order of magnitude of the maximum value for
/// an IEEE-754 32 bit floating point number.
/// See: https://en.wikipedia.org/wiki/Single-precision_floating-point_format
///
/// Note that a double (64-bit) has a maximum on the order of 10^308.
/// Feel free to implement that, but I didn't really have a demand for
/// implementing this for double precision as well
/// (float is enough precision for my use).
static const float __inv_pow_10[] = {
1.0f, // 0
0.1f, // 1
0.01f, // 2
0.001f, // 3
0.0001f, // 4
0.00001f, // 5
0.000001f, // 6
0.0000001f, // 7
0.00000001f, // 8
0.000000001f, // 9
0.0000000001f, // 10
0.00000000001f, // 11
0.000000000001f, // 12
0.0000000000001f, // 13
0.00000000000001f, // 14
0.000000000000001f, // 15
0.0000000000000001f, // 16
0.00000000000000001f, // 17
0.000000000000000001f, // 18
0.0000000000000000001f, // 19
0.00000000000000000001f, // 20
0.000000000000000000001f, // 21
0.0000000000000000000001f, // 22
0.00000000000000000000001f, // 23
0.000000000000000000000001f, // 24
0.0000000000000000000000001f, // 25
0.00000000000000000000000001f, // 26
0.000000000000000000000000001f, // 27
0.0000000000000000000000000001f, // 28
0.00000000000000000000000000001f, // 29
0.000000000000000000000000000001f, // 30
0.0000000000000000000000000000001f, // 31
0.00000000000000000000000000000001f, // 32
0.000000000000000000000000000000001f, // 33
0.0000000000000000000000000000000001f, // 34
0.00000000000000000000000000000000001f, // 35
0.000000000000000000000000000000000001f, // 36
0.00000000000000000000000000000000000001f, // 37
0.000000000000000000000000000000000000001f // 38
};
/// Same concept as above, but positive exponents of 10.
static const float __pos_pow_10[] = {
1.0f, // 0
10.0f, // 1
100.0f, // 2
1000.0f, // 3
10000.0f, // 4
100000.0f, // 5
1000000.0f, // 6
10000000.0f, // 7
100000000.0f, // 8
1000000000.0f, // 9
10000000000.0f, // 10
100000000000.0f, // 11
1000000000000.0f, // 12
10000000000000.0f, // 13
100000000000000.0f, // 14
1000000000000000.0f, // 15
10000000000000000.0f, // 16
100000000000000000.0f, // 17
1000000000000000000.0f, // 18
10000000000000000000.0f, // 19
100000000000000000000.0f, // 20
1000000000000000000000.0f, // 21
10000000000000000000000.0f, // 22
100000000000000000000000.0f, // 23
1000000000000000000000000.0f, // 24
10000000000000000000000000.0f, // 25
100000000000000000000000000.0f, // 26
1000000000000000000000000000.0f, // 27
10000000000000000000000000000.0f, // 28
100000000000000000000000000000.0f, // 29
1000000000000000000000000000000.0f, // 30
10000000000000000000000000000000.0f, // 31
100000000000000000000000000000000.0f, // 32
1000000000000000000000000000000000.0f, // 33
10000000000000000000000000000000000.0f, // 34
100000000000000000000000000000000000.0f, // 35
1000000000000000000000000000000000000.0f, // 36
10000000000000000000000000000000000000.0f, // 37
100000000000000000000000000000000000000.0f // 38
};
/// Offset pointers for calculation below (to save a few instructions)
static const float* const __inv_pow_10p = &(__inv_pow_10[-1]);
static const float* const __pos_pow_10o = &(__pos_pow_10[1]);
static const float* const __pos_pow_10p = &(__pos_pow_10[-1]);
static size_t floatToString(char* outstr, float value, size_t places=NUM_DECIMAL_PLACES) {
/// Used to store current digit for calculations below.
/// That means the possible values for this byte are [0, 9]
int8_t digit;
/// The order of magnitude on which the input resides
/// i.e. tenscount = (int)log10f(abs(value))
/// ^^ dont use log10f here -- tried and it's miserably slow :-)
size_t tenscount = 0;
/// counter for loops below
size_t i;
/// temp variable for calculations below
float tempfloat = value;
/// the number of characters consumed by the string representing this float
/// (this is what is returned)
size_t c = 0;
/// calculate rounding term d: 0.5/pow(10,places)
float d = (value < 0) ? -0.5f : 0.5f;
/// divide by ten for each decimal place
d = d * __inv_pow_10[places];
/// this small addition, combined with truncation will round our values properly
if (value < 0) {
tempfloat = -(value + d);
} else {
tempfloat = value + d;
}
/// This logic is similar to
/// tenscount = (int)log10f(tempfloat)
/// but is not miserably slow.
while (__pos_pow_10o[tenscount++] <= tempfloat) {}
/// the number is negative
if (value < 0) {
outstr[c++] = '-';
}
if (tenscount == 0) {
outstr[c++] = '0';
}
#pragma nounroll
for (i = 0; i < tenscount; i++) {
const ssize_t idx = tenscount-i;
digit = (int8_t) (tempfloat * __inv_pow_10p[idx]);
to_digit_char_unsafe(digit, &outstr[c++]);
tempfloat = tempfloat - ((float)digit * __pos_pow_10p[idx]);
}
/// if places is zero, then there is no decimal part
if (places > 0) {
outstr[c++] = '.';
}
#pragma nounroll
for (i = 0; i < places; i++) {
tempfloat *= 10.0f;
digit = (int8_t) tempfloat;
/// convert digit to character
/// If you replace this function with sprintf("%d", digit)
/// you will see where sprintf most likely spends most of
/// its time during their implementation of this routine :-)
///
/// Note that if for some reason digit is not within [0,9],
/// your number will be a flaming pile of shit
to_digit_char_unsafe(digit, &outstr[c++]);
tempfloat = tempfloat - (float) digit;
}
outstr[c++] = '\0';
return c;
}
static inline float getRandom() {
return LO + static_cast <float> (rand()) /( static_cast <float> (RAND_MAX/(HI-LO)));
}
static int perf_test() {
int SZ = ITERATIONS_PERF;
clock_t t1, t2;
double nTime, cTime, startTime, totalTime;
nTime = cTime = 0.0;
char bfr[BUFFERSIZE];
const size_t max_n = BUFFERSIZE - 1;
/// set seed for our random
srand (static_cast <unsigned> (time(0)));
float* randoms = new float[SZ];
for (int i = 0; i < SZ; i++) {
randoms[i] = getRandom();
}
////////////////////////////////
/// sprintf
t1 = clock();
for (int i = 0; i < SZ; i++) {
snprintf(bfr, max_n, SPRINTF_STR, randoms[i]);
}
t2 = clock();
startTime = (double)t1;
nTime = ((double)(t2 - t1) / 1000000.0 ) * 1000.0;
////////////////////////////////
/// custom format
t1 = clock();
for (int i = 0; i < SZ; i++) {
floatToString(bfr, randoms[i]);
}
t2 = clock();
cTime = ((double)(t2 - t1) / 1000000.0 ) * 1000.0;
totalTime = ((double)(t2 - startTime) / 1000000.0 ) * 1000.0;
printf("processed %d random numbers\n", SZ);
printf(
"Done processing."
"\nTotal time spent: %6.6f ms"
"\nsprintf : %6.6f ms"
"\ncustom : %6.6f ms\n",
totalTime,
nTime,
cTime
);
if (nTime < cTime) {
printf("native sprintf is %6.6f times faster\n", (cTime / nTime));
} else if (nTime > cTime) {
printf("custom is %6.6f times faster\n", (nTime / cTime));
} else {
printf("Both exhibited the same exact performance.\n");
}
delete[] randoms;
return 0;
}
static int logical_test() {
char bfr[BUFFERSIZE];
const size_t max_n = BUFFERSIZE - 1;
/// set seed for "random"
srand (static_cast <unsigned> (time(0)));
int SZ = ITERATIONS_LOGICAL;
float* randoms = new float[SZ];
for (int i = 0; i < SZ; i++) {
randoms[i] = getRandom();
}
printf("sprintf impl\t\tfloatToString impl\n\n");
for (int i = 0; i < SZ; i++) {
snprintf(bfr, max_n, SPRINTF_STR, randoms[i]);
printf("%s\t", bfr);
floatToString(bfr, randoms[i]);
printf("%s\n", bfr);
}
delete[] randoms;
return 0;
}
int main() {
perf_test();
printf("\n");
logical_test();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment