Skip to content

Instantly share code, notes, and snippets.

@foonathan
Created December 2, 2020 07:44
Show Gist options
  • Save foonathan/6777a1a3453dbc369179d902340d986d to your computer and use it in GitHub Desktop.
Save foonathan/6777a1a3453dbc369179d902340d986d to your computer and use it in GitHub Desktop.
benchmark: parsing ipv4 with Boost.Spirit and lexy
#include <benchmark/benchmark.h>
#include <random>
#include <string>
#include <vector>
constexpr auto ipv4_count = 8 * 1024u;

// Generate `ipv4_count` random dotted-quad IPv4 strings ("a.b.c.d", each
// octet in [0, 255]). The engine is default-seeded, so every benchmark run
// parses the same deterministic set of inputs.
std::vector<std::string> generate_ips()
{
std::vector<std::string> result;
result.reserve(ipv4_count); // single allocation for the vector's backing store
std::default_random_engine engine;
// NOTE: std::uniform_int_distribution<std::uint8_t> is undefined behavior —
// the standard ([rand.req.genl]) does not permit char-sized IntTypes, even
// though some standard libraries accept it. Draw `int`s in [0, 255] instead.
std::uniform_int_distribution<int> dist(0, 255);
for (auto i = 0u; i != ipv4_count; ++i)
{
auto a = dist(engine);
auto b = dist(engine);
auto c = dist(engine);
auto d = dist(engine);
result.push_back(std::to_string(a) + '.' + std::to_string(b) + '.' + std::to_string(c) + '.'
+ std::to_string(d));
}
return result;
}
#include <lexy/dsl.hpp>
#include <lexy/input/string_input.hpp>
#include <lexy/parse.hpp>
namespace dsl = lexy::dsl;
// Parse an IPv4 address into a `std::uint32_t`.
//
// A lexy production: the static `rule` describes what input to match, the
// static `value` describes how the matched pieces become the final result.
struct ipv4_address
{
// What is being matched.
static constexpr auto rule = [] {
// Match a sequence of (decimal) digits and convert it into a std::uint8_t.
// NOTE(review): presumably dsl::integer<std::uint8_t> rejects digit runs
// that overflow an octet (e.g. "999") — confirm against the lexy docs for
// this revision.
auto octet = dsl::integer<std::uint8_t>(dsl::digits<>);
// Match four of them separated by periods.
return dsl::times<4>(octet, dsl::sep(dsl::period));
}();
// How the matched output is being stored.
// The callback receives the four parsed octets and packs them big-endian:
// the first octet lands in the most significant byte of the result.
static constexpr auto value
= lexy::callback<std::uint32_t>([](lexy::times<4, std::uint8_t> octets) {
std::uint32_t result = 0;
for (auto o : octets)
{
// Shift previous octets up one byte, then OR in the next one.
result <<= 8;
result |= o;
}
return result;
});
};
// Benchmark: parse every generated IPv4 string with the lexy grammar above.
// Accumulates the parsed values into a checksum so the work cannot be
// dead-code-eliminated; reports throughput in bytes/second.
void bm_lexy(benchmark::State& state)
{
const auto addresses = generate_ips();
std::uint64_t checksum = 0;
std::size_t total_bytes = 0;
for (auto _ : state)
{
for (const auto& text : addresses)
{
auto in = lexy::string_input(text);
auto parsed = lexy::parse<ipv4_address>(in, lexy::noop);
checksum += parsed.value();
total_bytes += text.size();
benchmark::DoNotOptimize(parsed);
}
}
state.SetBytesProcessed(std::int64_t(total_bytes));
benchmark::DoNotOptimize(checksum);
}
BENCHMARK(bm_lexy);
#include <boost/fusion/adapted/array.hpp>
#include <boost/spirit/home/x3.hpp>
namespace x3 = boost::spirit::x3;
// Parse a dotted-quad IPv4 string in [first, last) with Boost.Spirit X3.
// Returns the address packed big-endian (first octet in the most significant
// byte), or std::uint32_t(-1) when the input does not match.
template <typename Iterator>
std::uint32_t parse_ipv4(Iterator first, Iterator last)
{
std::uint8_t parts[4] = {};
auto octet = x3::uint_parser<std::uint8_t>{};
bool ok
= x3::parse(first, last, octet >> '.' >> octet >> '.' >> octet >> '.' >> octet, parts);
if (!ok)
return std::uint32_t(-1);
// Assemble the four octets with explicit shifts instead of a fold loop.
return (std::uint32_t(parts[0]) << 24) | (std::uint32_t(parts[1]) << 16)
| (std::uint32_t(parts[2]) << 8) | std::uint32_t(parts[3]);
}
// Benchmark: parse every generated IPv4 string with Boost.Spirit X3 via
// parse_ipv4. Mirrors bm_lexy so the two throughput numbers are comparable.
void bm_spirit(benchmark::State& state)
{
const auto addresses = generate_ips();
std::uint64_t checksum = 0;
std::size_t total_bytes = 0;
for (auto _ : state)
{
for (const auto& text : addresses)
{
auto packed = parse_ipv4(text.begin(), text.end());
checksum += packed;
total_bytes += text.size();
benchmark::DoNotOptimize(packed);
}
}
state.SetBytesProcessed(std::int64_t(total_bytes));
benchmark::DoNotOptimize(checksum);
}
BENCHMARK(bm_spirit);
BENCHMARK_MAIN();
@foonathan
Copy link
Author

Results on my machine:

2020-12-02T08:46:34+01:00
Running ./a.out
Run on (6 X 3500 MHz CPU s)
CPU Caches:
  L1 Data 16 KiB (x6)
  L1 Instruction 64 KiB (x3)
  L2 Unified 2048 KiB (x3)
  L3 Unified 8192 KiB (x1)
Load Average: 0.99, 2.11, 2.24
---------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy        377042 ns       375207 ns         1850 bytes_per_second=276.509M/s
bm_spirit     3705316 ns      3689359 ns          181 bytes_per_second=28.121M/s

@foonathan
Copy link
Author

Someone else has more reasonable performance differences: https://twitter.com/chsiedentop/status/1334052430032597003

@ilyapopov
Copy link

I get results similar to yours (5x performance difference):

2020-12-02T16:51:01+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.48, 0.49, 0.53
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy        520376 ns       520173 ns         1309 bytes_per_second=199.45M/s
bm_spirit     2523099 ns      2521601 ns          278 bytes_per_second=41.1438M/s

Ubuntu 20.10
gcc (Ubuntu 10.2.0-13ubuntu1) 10.2.0
boost 1.71.0.0ubuntu4 from repos

Command line:

g++ -O3 -std=c++17 benchmark.cpp -I ~/devel/external/benchmark/include/ -I lexy/include/ -lboost_system -lboost_exception -L ~/devel/external/benchmark/build/src -lbenchmark -lpthread

@foonathan
Copy link
Author

It seems to be a regression that was introduced sometime between Boost version 1.65 and 1.71 and fixed sometime after 1.72 and before 1.74.

@foonathan
Copy link
Author

Okay, just adding -DNDEBUG made Boost.Spirit faster...

2020-12-02T21:01:42+01:00
Running ./a.out
Run on (6 X 3500 MHz CPU s)
CPU Caches:
  L1 Data 16 KiB (x6)
  L1 Instruction 64 KiB (x3)
  L2 Unified 2048 KiB (x3)
  L3 Unified 8192 KiB (x1)
Load Average: 1.83, 1.64, 1.45
---------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy        387691 ns       385575 ns         1861 bytes_per_second=269.074M/s
bm_spirit      843274 ns       838475 ns          840 bytes_per_second=123.735M/s

Guess that explains part of it.

@ilyapopov
Copy link

Yes, indeed. With -DNDEBUG boost is now faster for me:

2020-12-02T21:06:57+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.43, 0.42, 0.47
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy        518663 ns       518646 ns         1292 bytes_per_second=200.037M/s
bm_spirit      355924 ns       355924 ns         1963 bytes_per_second=291.49M/s

@foonathan
Copy link
Author

foonathan commented Dec 2, 2020

Can you try the feature/optimize-ip-assembly branch?

Edit: It's now merged into main.

@ilyapopov
Copy link

With current master (e82030f326f4def228b78b11dc1ca54b95eb8aae):

2020-12-03T01:21:02+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.26, 0.62, 0.81
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy        395142 ns       394845 ns         1781 bytes_per_second=262.757M/s
bm_spirit      363791 ns       363624 ns         1929 bytes_per_second=285.317M/s

@foonathan
Copy link
Author

Good, it became faster. Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment