Skip to content

Instantly share code, notes, and snippets.

@toughengineer
Last active October 13, 2021 10:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save toughengineer/5bd95c2c6867ad1dcdcd6f8c33042c33 to your computer and use it in GitHub Desktop.
Save toughengineer/5bd95c2c6867ad1dcdcd6f8c33042c33 to your computer and use it in GitHub Desktop.
fast ISO8601 datetime formatting

MSVC 2019

2020-08-07T23:46:38+03:00
Running D:\dev\counting_cycles\x64\Release\counting_cycles.exe
Run on (8 X 3600 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 8192 KiB (x1)
-------------------------------------------------------------
Benchmark                   Time             CPU   Iterations
-------------------------------------------------------------
iso8601_stream           1960 ns         1967 ns       373333
iso8601_stream_fmt        708 ns          711 ns      1120000
iso8601_fmt               391 ns          381 ns      1723077
iso8601_strftime          274 ns          270 ns      2488889
iso8601_fmt2              188 ns          184 ns      3733333
iso8601_manual           16.7 ns         16.7 ns     44800000
iso8601_fast             5.02 ns         5.00 ns    100000000

2021-10-13 Update: iso8601_strftime measurement propbably was incorrect because there was a bug in generation of values for tm structure.
Also I moved stream and buffer initialization out of the measurement loop (stream formatting performance is already bad on its own).
(Additionally I replaced write2Decimals implementation with one stolen from fmtlib, which made it a tiny bit faster.)
And I added "manual fast" method, which assumes year is no more than 4 decimal digits (and is hence faster).
Here are the updated results on the same machine with
MSVC 2019 16.11.4 with -std:c++20 and latest (trunk) fmtlib:

2021-10-13T13:30:48+03:00
Running D:\dev\counting_cycles\x64\Release\counting_cycles.exe
Run on (8 X 3600 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 8192 KiB (x1)
--------------------------------------------------------------
Benchmark                    Time             CPU   Iterations
--------------------------------------------------------------
iso8601_stream            1607 ns         1639 ns       448000
iso8601_stream_fmt         416 ns          414 ns      1659259
iso8601_fmt                324 ns          328 ns      2240000
iso8601_strftime           316 ns          321 ns      2240000
iso8601_fmt2               187 ns          188 ns      3733333
iso8601_manual            15.8 ns         15.7 ns     44800000
iso8601_manual_fast       5.71 ns         5.78 ns    100000000
iso8601_fast              4.31 ns         4.33 ns    165925926

I still have no capable CPU to benchmark/tweak the SIMD version, it seems to be slower than "fast" version
and it requires AVX512 for packed 64 bit integer multiply (_mm_mullo_epi64 mapping to vpmullq).
That said, I wouldn't use the SIMD version (unless it can be tweaked to run faster and that's really what you want).

#include <iostream>
#include <sstream>
#include <iomanip>
#include <string_view>
#include <charconv>
#include <ctime>
#include <cstring>
#include <benchmark/benchmark.h>
#include <fmt/ostream.h>
#include <fmt/compile.h>
#include <immintrin.h>
void write2Decimals(char *s, uint32_t n) {
const char *const digits =
"00010203040506070809"
"10111213141516171819"
"20212223242526272829"
"30313233343536373839"
"40414243444546474849"
"50515253545556575859"
"60616263646566676869"
"70717273747576777879"
"80818283848586878889"
"90919293949596979899" + n;
s[0] = digits[0];
s[1] = digits[1];
};
namespace fast {
// write in the format 'aa bb cc'
uint64_t writeTwoDecimalsWithSeparatorImpl(uint32_t a, uint32_t b, uint32_t c)
{
// send values to destination bytes
uint64_t digits = a | (b << 24) | (uint64_t(c) << 48);
// convert each value to BCD
/* we have x = a*10 + b and we want to convert it to BCD:
y = a*16 + b
the difference is
y - x = a * 6
a can be found from x:
a = floor(x / 10)
then
y = x + a * 6 = x + floor(x / 10) * 6
floor(x / 10) is (x * 205) >> 11 (needs 16 bits) */
digits += (((digits * 205) >> 11) & 0x000f0000'0f00000f) * 6;
// put low hex digit to high byte and high hex digit to low byte
return ((digits & 0x00f00000'f00000f0) >> 4) | ((digits & 0x000f0000'0f00000f) << 8);
}
template<char separator>
void writeTwoDecimalsWithSeparator(char *s, uint32_t a, uint32_t b, uint32_t c)
{
uint64_t digits = writeTwoDecimalsWithSeparatorImpl(a, b, c);
// add value of '0' ASCII char to each byte
digits |= 0x30300030'30003030 | (uint64_t(separator) << 16) | (uint64_t(separator) << 40);
memcpy(s, &digits, 8);
}
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
const auto yearHundreds = year / 100;
const auto yearTensAndOnes = year % 100;
write2Decimals(s, yearHundreds);
writeTwoDecimalsWithSeparator<'-'>(s + 2, yearTensAndOnes, month, day);
s[10] = 'T';
writeTwoDecimalsWithSeparator<':'>(s + 11, hours, minutes, seconds);
s[19] = 'Z';
}
}//namespace fast
namespace simd {
// write in the format 'aa bb cc'
auto writeTwoDecimalsWithSeparatorImpl(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f)
{
// send values to destination bytes
auto digits = _mm_set_epi64x(d | (e << 24) | (uint64_t(f) << 48), a | (b << 24) | (uint64_t(c) << 48));
// convert each value to BCD
/* we have x = a*10 + b and we want to convert it to BCD:
y = a*16 + b
the difference is
y - x = a * 6
a can be found from x:
a = floor(x / 10)
then
y = x + a * 6 = x + floor(x / 10) * 6
floor(x / 10) is (x * 205) >> 11 (needs 16 bits) */
const auto diff = _mm_srli_epi64(_mm_mullo_epi64(digits, _mm_set1_epi64x(205)), 11);
const auto loDigitMask = _mm_set1_epi64x(0x000f0000'0f00000f);
digits = _mm_add_epi64(digits, _mm_mullo_epi64(_mm_and_si128(diff, loDigitMask), _mm_set1_epi64x(6)));
const auto loBytes = _mm_srli_epi64(_mm_and_si128(digits, _mm_set1_epi64x(0x00f00000'f00000f0)), 4);
const auto hiBytes = _mm_slli_epi64(_mm_and_si128(digits, loDigitMask), 8);
const auto asciiMaskAndSeparators = _mm_set_epi64x(
0x30300030'30003030 | (uint64_t(':') << 16) | (uint64_t(':') << 40),
0x30300030'30003030 | (uint64_t('-') << 16) | (uint64_t('-') << 40));
return _mm_or_si128(_mm_or_si128(loBytes, hiBytes), asciiMaskAndSeparators);
}
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
const auto yearHundreds = year / 100;
const auto yearTensAndUnits = year % 100;
s[0] = static_cast<char>('0' + yearHundreds / 10);
s[1] = static_cast<char>('0' + yearHundreds % 10);
const auto result = writeTwoDecimalsWithSeparatorImpl(yearTensAndUnits, month, day, hours, minutes, seconds);
_mm_storeu_si128(reinterpret_cast<__m128i *>(s + 2), result);
std::memmove(s + 11, s + 10, 8);
s[10] = 'T';
s[19] = 'Z';
}
}//namespace simd
namespace manual {
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
//0123-56-89T12:45:78Z
// 0 0 1111111111
//01234567890123456789
const size_t yearLength = std::min(uint32_t(year != 0) + uint32_t(year >= 10) + uint32_t(year >= 100) + uint32_t(year >= 1000), uint32_t(4));
memset(s, '0', 4 - yearLength);
std::to_chars(s + 4 - yearLength, s + 4, year);
s[4] = '-';
write2Decimals(s + 5, month);
s[7] = '-';
write2Decimals(s + 8, day);
s[10] = 'T';
write2Decimals(s + 11, hours);
s[13] = ':';
write2Decimals(s + 14, minutes);
s[16] = ':';
write2Decimals(s + 17, seconds);
s[19] = 'Z';
}
}//namespace manual
namespace manual_fast {
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
//0123-56-89T12:45:78Z
// 0 0 1111111111
//01234567890123456789
write2Decimals(s, year / 100);
write2Decimals(s + 2, year % 100);
s[4] = '-';
write2Decimals(s + 5, month);
s[7] = '-';
write2Decimals(s + 8, day);
s[10] = 'T';
write2Decimals(s + 11, hours);
s[13] = ':';
write2Decimals(s + 14, minutes);
s[16] = ':';
write2Decimals(s + 17, seconds);
s[19] = 'Z';
}
}//namespace manual_fast
static void iso8601_stream(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
std::ostringstream s;
for (auto _ : state) {
s.clear();
using std::setw;
const auto fillChar = s.fill('0');
s << a << '-' << setw(2) << b << '-' << setw(2) << c << 'T'
<< setw(2) << d << ':' << setw(2) << e << ':' << setw(2) << f << 'Z';
s.fill(fillChar);
benchmark::DoNotOptimize(s);
}
}
BENCHMARK(iso8601_stream);
static void iso8601_stream_fmt(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
std::ostringstream s;
for (auto _ : state) {
s.clear();
fmt::print(s, "{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z", a, b, c, d, e, f);
benchmark::DoNotOptimize(s);
}
}
BENCHMARK(iso8601_stream_fmt);
static void iso8601_fmt(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
char buf[20] = {};
for (auto _ : state) {
fmt::format_to(buf, "{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z", a, b, c, d, e, f);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_fmt);
static void iso8601_strftime(benchmark::State &state) {
srand(time(0));
tm t;
t.tm_year = rand() % 10000 - 1900;
t.tm_mon = rand() % 12;
t.tm_mday = rand() % 31 + 1;
t.tm_hour = rand() % 24;
t.tm_min = rand() % 60;
t.tm_sec = rand() % 60;
char buf[21] = {};
for (auto _ : state) {
std::strftime(buf, 21, "%FT%TZ", &t);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_strftime);
static void iso8601_fmt2(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
char buf[20] = {};
for (auto _ : state) {
fmt::format_to(buf, FMT_COMPILE("{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z"), a, b, c, d, e, f);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_fmt2);
static void iso8601_manual(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
char res[20] = {};
for (auto _ : state) {
manual::formatISO8601(res, a, b, c, d, e, f);
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(iso8601_manual);
static void iso8601_manual_fast(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
char res[20] = {};
for (auto _ : state) {
manual_fast::formatISO8601(res, a, b, c, d, e, f);
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(iso8601_manual_fast);
static void iso8601_fast(benchmark::State &state) {
srand(time(0));
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
char res[20] = {};
for (auto _ : state) {
fast::formatISO8601(res, a, b, c, d, e, f);
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(iso8601_fast);
//int main(int argc, char **argv) {
// tm t;
// t.tm_year = 2020 - 1900;
// t.tm_mon = 11;
// t.tm_mday = 22;
// t.tm_hour = 0;
// t.tm_min = 33;
// t.tm_sec = 55;
//
// char buf[21] = {};
// std::strftime(buf, 21, "%FT%TZ", &t);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// manual::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// fast::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// //simd::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// //std::cout << std::string_view{ buf, 20 } << '\n';
//}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment