Skip to content

Instantly share code, notes, and snippets.

@Lectem
Forked from toughengineer/README.md
Created September 10, 2021 14:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Lectem/13dc3d43346e1c5efb0c21dd1975b964 to your computer and use it in GitHub Desktop.
Save Lectem/13dc3d43346e1c5efb0c21dd1975b964 to your computer and use it in GitHub Desktop.
fast ISO8601 datetime formatting

MSVC 2019

2020-08-07T23:46:38+03:00
Running D:\dev\counting_cycles\x64\Release\counting_cycles.exe
Run on (8 X 3600 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 8192 KiB (x1)
-------------------------------------------------------------
Benchmark                   Time             CPU   Iterations
-------------------------------------------------------------
iso8601_stream           1960 ns         1967 ns       373333
iso8601_stream_fmt        708 ns          711 ns      1120000
iso8601_fmt               391 ns          381 ns      1723077
iso8601_strftime          274 ns          270 ns      2488889
iso8601_fmt2              188 ns          184 ns      3733333
iso8601_manual           16.7 ns         16.7 ns     44800000
iso8601_fast             5.02 ns         5.00 ns    100000000

I have no capable CPU to benchmark the SIMD version, but it seems to be still a bit faster with Clang under linux (https://godbolt.org/z/n4Kzv7)
though it requires AVX512 for packed 64 bit integer multiply (_mm_mullo_epi64 mapping to vpmullq).
That said, I wouldn't use the SIMD version unless absolutely necessary.

#include <iostream>
#include <sstream>
#include <iomanip>
#include <string_view>
#include <charconv>
#include <ctime>
#include <cstring>
#include <benchmark/benchmark.h>
#include <fmt/ostream.h>
#include <fmt/compile.h>
#include <immintrin.h>
namespace fast {
// write in the format 'aa bb cc'
uint64_t writeTwoDecimalsWithSeparatorImpl(uint32_t a, uint32_t b, uint32_t c)
{
// send values to destination bytes
uint64_t digits = a | (b << 24) | (uint64_t(c) << 48);
// convert each value to BCD
/* we have x = a*10 + b and we want to convert it to BCD:
y = a*16 + b
the difference is
y - x = a * 6
a can be found from x:
a = floor(x / 10)
then
y = x + a * 6 = x + floor(x / 10) * 6
floor(x / 10) is (x * 205) >> 11 (needs 16 bits) */
digits += (((digits * 205) >> 11) & 0x000f0000'0f00000f) * 6;
// put low hex digit to high byte and high hex digit to low byte
return ((digits & 0x00f00000'f00000f0) >> 4) | ((digits & 0x000f0000'0f00000f) << 8);
}
template<char separator>
void writeTwoDecimalsWithSeparator(char(&s)[8], uint32_t a, uint32_t b, uint32_t c)
{
uint64_t digits = writeTwoDecimalsWithSeparatorImpl(a, b, c);
// add value of '0' ASCII char to each byte
digits |= 0x30300030'30003030 | (uint64_t(separator) << 16) | (uint64_t(separator) << 40);
memcpy(s, &digits, 8);
}
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
const auto yearHundreds = year / 100;
const auto yearTensAndUnits = year % 100;
s[0] = static_cast<char>('0' + yearHundreds / 10);
s[1] = static_cast<char>('0' + yearHundreds % 10);
char bufD[8] = {};
writeTwoDecimalsWithSeparator<'-'>(bufD, yearTensAndUnits, month, day);
memcpy(s + 2, bufD, 8);
s[10] = 'T';
char bufT[8] = {};
writeTwoDecimalsWithSeparator<':'>(bufT, hours, minutes, seconds);
memcpy(s + 11, bufT, 8);
s[19] = 'Z';
}
}//namespace fast
namespace simd {
// write in the format 'aa bb cc'
auto writeTwoDecimalsWithSeparatorImpl(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f)
{
// send values to destination bytes
auto digits = _mm_set_epi64x(d | (e << 24) | (uint64_t(f) << 48), a | (b << 24) | (uint64_t(c) << 48));
// convert each value to BCD
/* we have x = a*10 + b and we want to convert it to BCD:
y = a*16 + b
the difference is
y - x = a * 6
a can be found from x:
a = floor(x / 10)
then
y = x + a * 6 = x + floor(x / 10) * 6
floor(x / 10) is (x * 205) >> 11 (needs 16 bits) */
const auto diff = _mm_srli_epi64(_mm_mullo_epi64(digits, _mm_set1_epi64x(205)), 11);
const auto loDigitMask = _mm_set1_epi64x(0x000f0000'0f00000f);
digits = _mm_add_epi64(digits, _mm_mullo_epi64(_mm_and_si128(diff, loDigitMask), _mm_set1_epi64x(6)));
const auto loBytes = _mm_srli_epi64(_mm_and_si128(digits, _mm_set1_epi64x(0x00f00000'f00000f0)), 4);
const auto hiBytes = _mm_slli_epi64(_mm_and_si128(digits, loDigitMask), 8);
const auto asciiMaskAndSeparators = _mm_set_epi64x(
0x30300030'30003030 | (uint64_t(':') << 16) | (uint64_t(':') << 40),
0x30300030'30003030 | (uint64_t('-') << 16) | (uint64_t('-') << 40));
return _mm_or_si128(_mm_or_si128(loBytes, hiBytes), asciiMaskAndSeparators);
}
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
const auto yearHundreds = year / 100;
const auto yearTensAndUnits = year % 100;
s[0] = static_cast<char>('0' + yearHundreds / 10);
s[1] = static_cast<char>('0' + yearHundreds % 10);
const auto result = writeTwoDecimalsWithSeparatorImpl(yearTensAndUnits, month, day, hours, minutes, seconds);
_mm_storeu_si128(reinterpret_cast<__m128i *>(s + 2), result);
std::memmove(s + 11, s + 10, 8);
s[10] = 'T';
s[19] = 'Z';
}
}//namespace simd
namespace manual {
void formatISO8601(char *s, uint32_t year, uint32_t month, uint32_t day, uint32_t hours, uint32_t minutes, uint32_t seconds) {
auto write2Decimals = [](char *s, uint32_t n) {
s[0] = static_cast<char>('0' + n / 10);
s[1] = static_cast<char>('0' + n % 10);
};
//0123-56-89T12:45:78Z
// 0 0 1111111111
//01234567890123456789
const size_t yearLength = uint32_t(year != 0) + uint32_t(year >= 10) + uint32_t(year >= 100) + uint32_t(year >= 1000);
memset(s, '0', 4 - yearLength);
std::to_chars(s + 4 - yearLength, s + 4, year);
s[4] = '-';
write2Decimals(s + 5, month);
s[7] = '-';
write2Decimals(s + 8, day);
s[10] = 'T';
write2Decimals(s + 11, hours);
s[13] = ':';
write2Decimals(s + 14, minutes);
s[16] = ':';
write2Decimals(s + 17, seconds);
s[19] = 'Z';
}
}//namespace manual
static void iso8601_stream(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
for (auto _ : state) {
std::ostringstream s;
using std::setw;
const auto fillChar = s.fill('0');
s << a << '-' << setw(2) << b << '-' << setw(2) << c << 'T'
<< setw(2) << d << ':' << setw(2) << e << ':' << setw(2) << f << 'Z';
s.fill(fillChar);
benchmark::DoNotOptimize(s);
}
}
BENCHMARK(iso8601_stream);
static void iso8601_stream_fmt(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
for (auto _ : state) {
std::ostringstream s;
fmt::print(s, "{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z", a, b, c, d, e, f);
benchmark::DoNotOptimize(s);
}
}
BENCHMARK(iso8601_stream_fmt);
static void iso8601_fmt(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
for (auto _ : state) {
char buf[20] = {};
fmt::format_to(buf, "{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z", a, b, c, d, e, f);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_fmt);
static void iso8601_strftime(benchmark::State &state) {
srand(0);
tm t;
t.tm_year = rand() % 10000 - 1900;
t.tm_mon = rand() % 12 + 1;
t.tm_mday = rand() % 31;
t.tm_hour = rand() % 24;
t.tm_min = rand() % 60;
t.tm_sec = rand() % 60;
for (auto _ : state) {
char buf[21] = {};
std::strftime(buf, 21, "%FT%TZ", &t);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_strftime);
static void iso8601_fmt2(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
for (auto _ : state) {
char buf[20] = {};
fmt::format_to(buf, FMT_COMPILE("{}-{:0>2}-{:0>2}T{:0>2}:{:0>2}:{:0>2}Z"), a, b, c, d, e, f);
benchmark::DoNotOptimize(buf);
}
}
BENCHMARK(iso8601_fmt2);
static void iso8601_manual(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
auto write2Decimals = [](char *s, uint32_t n) {
s[0] = static_cast<char>('0' + n / 10);
s[1] = static_cast<char>('0' + n % 10);
};
for (auto _ : state) {
char res[20] = {};
manual::formatISO8601(res, a, b, c, d, e, f);
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(iso8601_manual);
static void iso8601_fast(benchmark::State &state) {
srand(0);
const uint32_t
a = rand() % 10000,
b = rand() % 100,
c = rand() % 100,
d = rand() % 100,
e = rand() % 100,
f = rand() % 100;
for (auto _ : state) {
char res[20] = {};
fast::formatISO8601(res, a, b, c, d, e, f);
benchmark::DoNotOptimize(res);
}
}
BENCHMARK(iso8601_fast);
//int main(int argc, char **argv) {
// tm t;
// t.tm_year = 2020 - 1900;
// t.tm_mon = 11;
// t.tm_mday = 22;
// t.tm_hour = 0;
// t.tm_min = 33;
// t.tm_sec = 55;
//
// char buf[21] = {};
// std::strftime(buf, 21, "%FT%TZ", &t);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// manual::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// fast::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// std::cout << std::string_view{ buf, 20 } << '\n';
//
// //simd::formatISO8601(buf, t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
// //std::cout << std::string_view{ buf, 20 } << '\n';
//}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment