-
-
Save foonathan/6777a1a3453dbc369179d902340d986d to your computer and use it in GitHub Desktop.
benchmark: parsing ipv4 with Boost.Spirit and lexy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <benchmark/benchmark.h> | |
#include <random> | |
#include <string> | |
#include <vector> | |
constexpr auto ipv4_count = 8 * 1024u; | |
std::vector<std::string> generate_ips() | |
{ | |
std::vector<std::string> result; | |
std::default_random_engine engine; | |
std::uniform_int_distribution<std::uint8_t> dist; | |
for (auto i = 0u; i != ipv4_count; ++i) | |
{ | |
auto a = dist(engine); | |
auto b = dist(engine); | |
auto c = dist(engine); | |
auto d = dist(engine); | |
result.push_back(std::to_string(a) + '.' + std::to_string(b) + '.' + std::to_string(c) + '.' | |
+ std::to_string(d)); | |
} | |
return result; | |
} | |
#include <lexy/dsl.hpp> | |
#include <lexy/input/string_input.hpp> | |
#include <lexy/parse.hpp> | |
namespace dsl = lexy::dsl; | |
// Parse an IPv4 address into a `std::uint32_t`. | |
struct ipv4_address | |
{ | |
// What is being matched. | |
static constexpr auto rule = [] { | |
// Match a sequence of (decimal) digits and convert it into a std::uint8_t. | |
auto octet = dsl::integer<std::uint8_t>(dsl::digits<>); | |
// Match four of them separated by periods. | |
return dsl::times<4>(octet, dsl::sep(dsl::period)); | |
}(); | |
// How the matched output is being stored. | |
static constexpr auto value | |
= lexy::callback<std::uint32_t>([](lexy::times<4, std::uint8_t> octets) { | |
std::uint32_t result = 0; | |
for (auto o : octets) | |
{ | |
result <<= 8; | |
result |= o; | |
} | |
return result; | |
}); | |
}; | |
void bm_lexy(benchmark::State& state) | |
{ | |
auto ips = generate_ips(); | |
std::uint64_t dummy = 0; | |
std::size_t bytes = 0; | |
for (auto _ : state) | |
{ | |
for (auto& ip : ips) | |
{ | |
auto input = lexy::string_input(ip); | |
auto result = lexy::parse<ipv4_address>(input, lexy::noop); | |
dummy += result.value(); | |
bytes += ip.size(); | |
benchmark::DoNotOptimize(result); | |
} | |
} | |
state.SetBytesProcessed(std::int64_t(bytes)); | |
benchmark::DoNotOptimize(dummy); | |
} | |
BENCHMARK(bm_lexy); | |
#include <boost/fusion/adapted/array.hpp> | |
#include <boost/spirit/home/x3.hpp> | |
namespace x3 = boost::spirit::x3; | |
template <typename Iterator> | |
std::uint32_t parse_ipv4(Iterator first, Iterator last) | |
{ | |
std::uint8_t octets[4] = {}; | |
auto octet = x3::uint_parser<std::uint8_t>{}; | |
auto result | |
= x3::parse(first, last, octet >> '.' >> octet >> '.' >> octet >> '.' >> octet, octets); | |
if (!result) | |
return std::uint32_t(-1); | |
std::uint32_t ipv4 = 0; | |
for (auto o : octets) | |
{ | |
ipv4 <<= 8; | |
ipv4 |= o; | |
} | |
return ipv4; | |
} | |
void bm_spirit(benchmark::State& state) | |
{ | |
auto ips = generate_ips(); | |
std::uint64_t dummy = 0; | |
std::size_t bytes = 0; | |
for (auto _ : state) | |
{ | |
for (auto& ip : ips) | |
{ | |
auto result = parse_ipv4(ip.begin(), ip.end()); | |
dummy += result; | |
bytes += ip.size(); | |
benchmark::DoNotOptimize(result); | |
} | |
} | |
state.SetBytesProcessed(std::int64_t(bytes)); | |
benchmark::DoNotOptimize(dummy); | |
} | |
BENCHMARK(bm_spirit); | |
BENCHMARK_MAIN(); |
Someone else has more reasonable performance differences: https://twitter.com/chsiedentop/status/1334052430032597003
I get results similar to yours (5x performance difference):
2020-12-02T16:51:01+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.48, 0.49, 0.53
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy 520376 ns 520173 ns 1309 bytes_per_second=199.45M/s
bm_spirit 2523099 ns 2521601 ns 278 bytes_per_second=41.1438M/s
Ubuntu 20.10
gcc (Ubuntu 10.2.0-13ubuntu1) 10.2.0
boost 1.71.0.0ubuntu4 from repos
Command line:
g++ -O3 -std=c++17 benchmark.cpp -I ~/devel/external/benchmark/include/ -I lexy/include/ -lboost_system -lboost_exception -L ~/devel/external/benchmark/build/src -lbenchmark -lpthread
It seems to be a regression that was introduced sometime between Boost version 1.65 and 1.71 and fixed sometime after 1.72 and before 1.74.
Okay, just adding -DNDEBUG
made Boost.spirit faster...
2020-12-02T21:01:42+01:00
Running ./a.out
Run on (6 X 3500 MHz CPU s)
CPU Caches:
L1 Data 16 KiB (x6)
L1 Instruction 64 KiB (x3)
L2 Unified 2048 KiB (x3)
L3 Unified 8192 KiB (x1)
Load Average: 1.83, 1.64, 1.45
---------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy 387691 ns 385575 ns 1861 bytes_per_second=269.074M/s
bm_spirit 843274 ns 838475 ns 840 bytes_per_second=123.735M/s
Guess that explains part of it.
Yes, indeed. With -DNDEBUG
boost is now faster for me:
2020-12-02T21:06:57+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.43, 0.42, 0.47
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy 518663 ns 518646 ns 1292 bytes_per_second=200.037M/s
bm_spirit 355924 ns 355924 ns 1963 bytes_per_second=291.49M/s
Can you try the feature/optimize-ip-assembly branch?
Edit: It's now merged into main.
With current master (e82030f326f4def228b78b11dc1ca54b95eb8aae):
2020-12-03T01:21:02+01:00
Running ./a.out
Run on (4 X 2800 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.26, 0.62, 0.81
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------
bm_lexy 395142 ns 394845 ns 1781 bytes_per_second=262.757M/s
bm_spirit 363791 ns 363624 ns 1929 bytes_per_second=285.317M/s
Good, it became faster. Thank you.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results on my machine: