Created
January 12, 2017 21:14
-
-
Save tgockel/58381c41cb8b18fc8f5632159b3ce56f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** \file crc32.cpp | |
* A number of implementations of a CRC32 algorithm. This program will only work on a CPU with SSE 4.2 support. | |
* | |
* Compile with: | |
* | |
* $> g++ --std=c++17 crc32.cpp -O3 -fwhole-program | |
* | |
* The output is a CSV file showing the timings of the various implementations. | |
**/ | |
#include <algorithm> | |
#include <chrono> | |
#include <cstdint> | |
#include <iomanip> | |
#include <iostream> | |
#include <map> | |
#include <random> | |
#include <stdexcept> | |
#include <string> | |
#include <tuple> | |
#include <utility> | |
#include <vector> | |
#include <boost/crc.hpp> | |
#include <smmintrin.h> | |
#include <x86intrin.h> | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// Boost // | |
// Baseline. Uses bit shift and lookup table. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
std::uint32_t crc32_boost(const char* first, const char* last) | |
{ | |
boost::crc_optimal<32, 0x1edc6f41, 0xffffffff, 0xffffffff, true, true> machine; | |
machine.process_block(first, last); | |
return machine.checksum(); | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// A // | |
// Simple implementation using the crc32b instruction. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32a(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
for ( ; first != last; ++first) | |
code = _mm_crc32_u8(code, *first); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// B // | |
// Same as A, but with the for_each algorithm. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32b(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
std::for_each(first, last, | |
[&] (const char& x) __attribute__((target("sse4.2"))) { code = _mm_crc32_u8(code, x); } | |
); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// C // | |
// Compute the next step with the largest chunk possible. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32c(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
while (first != last) | |
{ | |
if (reinterpret_cast<std::uintptr_t>(first) % 8 == 0 && first + 8 <= last) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first)); | |
first += 8; | |
} | |
else if (reinterpret_cast<std::uintptr_t>(first) % 4 == 0 && first + 4 <= last) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first)); | |
first += 4; | |
} | |
else if (reinterpret_cast<std::uintptr_t>(first) % 2 == 0 && first + 2 <= last) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first)); | |
first += 2; | |
} | |
else | |
{ | |
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first)); | |
++first; | |
} | |
} | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// D // | |
// Explicit striding. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32d(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
while (first + 1 <= last | |
&& (reinterpret_cast<std::uintptr_t>(first) % 2 != 0 || first + 2 > last) | |
) | |
{ | |
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first)); | |
first += 1; | |
} | |
while (first + 2 <= last | |
&& (reinterpret_cast<std::uintptr_t>(first) % 4 != 0 || first + 4 > last) | |
) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first)); | |
first += 2; | |
} | |
while (first + 4 <= last | |
&& (reinterpret_cast<std::uintptr_t>(first) % 8 != 0 || first + 8 > last) | |
) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first)); | |
first += 4; | |
} | |
while (reinterpret_cast<std::uintptr_t>(first) % 8 == 0 | |
&& first + 8 <= last | |
) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first)); | |
first += 8; | |
} | |
while (first + 4 <= last) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first)); | |
first += 4; | |
} | |
while (first + 2 <= last) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first)); | |
first += 2; | |
} | |
while (first + 1 <= last) | |
{ | |
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first)); | |
first += 1; | |
} | |
if (first != last) | |
throw std::runtime_error(""); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// E // | |
// Geneic | |
// // | |
// Geneic version of algorithm C. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
namespace detail | |
{ | |
template <std::size_t... ByteAlign> | |
struct at_each_aligned_e_impl; | |
template <std::size_t ByteAlign, std::size_t... ByteAlignRest> | |
struct at_each_aligned_e_impl<ByteAlign, ByteAlignRest...> | |
{ | |
template <typename TChar, typename FApply, typename... FApplyRest> | |
static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest) | |
{ | |
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0 | |
&& first + ByteAlign <= last | |
) | |
{ | |
apply(first); | |
return first + ByteAlign; | |
} | |
else | |
{ | |
return at_each_aligned_e_impl<ByteAlignRest...>::step(first, last, apply_rest...); | |
} | |
} | |
}; | |
template <> | |
struct at_each_aligned_e_impl<1> | |
{ | |
template <typename TChar, typename FApply> | |
static TChar* step(TChar* first, TChar*, const FApply& apply) | |
{ | |
apply(first); | |
return first + 1; | |
} | |
}; | |
template <std::size_t... ByteAlign, typename TChar, typename... FApply> | |
void at_each_aligned_e(TChar* first, TChar* last, FApply&&... transform) | |
{ | |
while (first != last) | |
{ | |
first = at_each_aligned_e_impl<ByteAlign...>::step(first, last, transform...); | |
} | |
} | |
} | |
template <std::size_t... ByteAlign, typename... FApply> | |
void at_each_aligned_e(const char* first, const char* last, FApply&&... transform) | |
{ | |
return detail::at_each_aligned_e<ByteAlign...>(first, last, | |
std::forward<FApply>(transform)... | |
); | |
} | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32e(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
at_each_aligned_e<8, 4, 2, 1> | |
( | |
first, last, | |
[&] (const char* p64) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64)); | |
}, | |
[&] (const char* p32) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32)); | |
}, | |
[&] (const char* p16) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16)); | |
}, | |
[&] (const char* p8) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8)); | |
} | |
); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// F // | |
// Same as algorithm E, but with always_inline attributes. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
namespace detail | |
{ | |
template <std::size_t... ByteAlign> | |
struct at_each_aligned_f_impl; | |
template <std::size_t ByteAlign, std::size_t... ByteAlignRest> | |
struct at_each_aligned_f_impl<ByteAlign, ByteAlignRest...> | |
{ | |
template <typename TChar, typename FApply, typename... FApplyRest> | |
__attribute__((always_inline)) inline | |
static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest) | |
{ | |
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0 | |
&& first + ByteAlign <= last | |
) | |
{ | |
apply(first); | |
return first + ByteAlign; | |
} | |
else | |
{ | |
return at_each_aligned_f_impl<ByteAlignRest...>::step(first, last, apply_rest...); | |
} | |
} | |
}; | |
template <> | |
struct at_each_aligned_f_impl<1> | |
{ | |
template <typename TChar, typename FApply> | |
__attribute__((always_inline)) inline | |
static TChar* step(TChar* first, TChar*, const FApply& apply) | |
{ | |
apply(first); | |
return first + 1; | |
} | |
}; | |
template <std::size_t... ByteAlign, typename TChar, typename... FApply> | |
__attribute__((always_inline)) inline | |
void at_each_aligned_f(TChar* first, TChar* last, FApply&&... transform) | |
{ | |
while (first != last) | |
{ | |
first = at_each_aligned_f_impl<ByteAlign...>::step(first, last, transform...); | |
} | |
} | |
} | |
template <std::size_t... ByteAlign, typename... FApply> | |
__attribute__((always_inline)) inline | |
void at_each_aligned_f(const char* first, const char* last, FApply&&... transform) | |
{ | |
return detail::at_each_aligned_f<ByteAlign...>(first, last, | |
std::forward<FApply>(transform)... | |
); | |
} | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32f(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
at_each_aligned_f<8, 4, 2, 1> | |
( | |
first, last, | |
[&] (const char* p64) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64)); | |
}, | |
[&] (const char* p32) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32)); | |
}, | |
[&] (const char* p16) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16)); | |
}, | |
[&] (const char* p8) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8)); | |
} | |
); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// G // | |
// Generic version of algorithm D. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
namespace detail | |
{ | |
template <std::size_t ByteAlign, typename TChar, typename FApply> | |
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply) | |
{ | |
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0) | |
{ | |
while (first + ByteAlign <= last) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
} | |
return first; | |
} | |
template <std::size_t ByteAlign, | |
std::size_t ByteAlignNext, | |
std::size_t... ByteAlignRest, | |
typename TChar, | |
typename FApply, | |
typename... FApplyRest | |
> | |
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest) | |
{ | |
while (first + ByteAlign <= last | |
&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last) | |
) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
first = at_each_aligned_g_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...); | |
while (first + ByteAlign <= last) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
return first; | |
} | |
} | |
template <std::size_t... ByteAlign, typename... FApply> | |
void at_each_aligned_g(const char* first, const char* last, FApply&&... transform) | |
{ | |
detail::at_each_aligned_g_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...); | |
} | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32g(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
at_each_aligned_g<1, 2, 4, 8> | |
( | |
first, last, | |
[&] (const char* p8) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8)); | |
}, | |
[&] (const char* p16) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16)); | |
}, | |
[&] (const char* p32) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32)); | |
}, | |
[&] (const char* p64) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64)); | |
} | |
); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// H // | |
// Inline version of algorithm G. // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
namespace detail | |
{ | |
template <std::size_t ByteAlign, typename TChar, typename FApply> | |
__attribute__((always_inline)) inline | |
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply) | |
{ | |
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0) | |
{ | |
while (first + ByteAlign <= last) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
} | |
return first; | |
} | |
template <std::size_t ByteAlign, | |
std::size_t ByteAlignNext, | |
std::size_t... ByteAlignRest, | |
typename TChar, | |
typename FApply, | |
typename... FApplyRest | |
> | |
__attribute__((always_inline)) inline | |
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest) | |
{ | |
while (first + ByteAlign <= last | |
&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last) | |
) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
first = at_each_aligned_h_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...); | |
while (first + ByteAlign <= last) | |
{ | |
apply(first); | |
first += ByteAlign; | |
} | |
return first; | |
} | |
} | |
template <std::size_t... ByteAlign, typename... FApply> | |
__attribute__((always_inline)) inline | |
void at_each_aligned_h(const char* first, const char* last, FApply&&... transform) | |
{ | |
detail::at_each_aligned_h_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...); | |
} | |
__attribute__((target("sse4.2"))) | |
std::uint32_t crc32h(const char* first, const char* last) | |
{ | |
std::uint32_t code = ~0U; | |
at_each_aligned_h<1, 2, 4, 8> | |
( | |
first, last, | |
[&] (const char* p8) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8)); | |
}, | |
[&] (const char* p16) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16)); | |
}, | |
[&] (const char* p32) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32)); | |
}, | |
[&] (const char* p64) __attribute__((target("sse4.2"))) | |
{ | |
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64)); | |
} | |
); | |
return ~code; | |
} | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// Benchmarking // | |
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
template <typename T> | |
struct time_result | |
{ | |
T result; | |
std::chrono::nanoseconds elapsed; | |
std::int64_t elapsed_ticks; | |
time_result(T&& res, std::chrono::nanoseconds time, std::int64_t ticks) : | |
result(std::move(res)), | |
elapsed(time), | |
elapsed_ticks(ticks) | |
{ } | |
}; | |
static int64_t rdtscp() | |
{ | |
unsigned int aux; | |
return __rdtscp(&aux); | |
} | |
template <typename F> | |
auto time(F&& f) -> time_result<decltype(f())> | |
{ | |
using clock = std::chrono::steady_clock; | |
auto start_time = clock::now(); | |
auto start_tick = rdtscp(); | |
auto res = std::forward<F>(f)(); | |
auto end_time = clock::now(); | |
auto end_tick = rdtscp(); | |
return { std::move(res), end_time - start_time, end_tick - start_tick }; | |
} | |
template <typename T> | |
std::ostream& operator<<(std::ostream& os, const time_result<T>& x) | |
{ | |
os << "(result=" << x.result; | |
os << " time=" << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(x.elapsed).count(); | |
os << ')'; | |
return os; | |
} | |
static void run_test(std::ostream& os, std::size_t data_size) | |
{ | |
const int trials = 5000; | |
using contents_type = std::tuple<std::uint32_t (*)(const char*, const char*), std::chrono::nanoseconds, std::int64_t>; | |
using algo_map_type = std::map<std::string, contents_type>; | |
algo_map_type algos = | |
{ | |
{ "Boost", { &crc32_boost, std::chrono::nanoseconds(0), 0 } }, | |
{ "a", { &crc32a, std::chrono::nanoseconds(0), 0 } }, | |
{ "b", { &crc32b, std::chrono::nanoseconds(0), 0 } }, | |
{ "c", { &crc32c, std::chrono::nanoseconds(0), 0 } }, | |
{ "d", { &crc32d, std::chrono::nanoseconds(0), 0 } }, | |
{ "e", { &crc32e, std::chrono::nanoseconds(0), 0 } }, | |
{ "f", { &crc32f, std::chrono::nanoseconds(0), 0 } }, | |
{ "g", { &crc32g, std::chrono::nanoseconds(0), 0 } }, | |
{ "h", { &crc32h, std::chrono::nanoseconds(0), 0 } }, | |
}; | |
std::vector<char> data(data_size); | |
std::mt19937 rng(std::random_device{}()); | |
std::uniform_int_distribution<> dist; | |
std::vector<algo_map_type::iterator> not_run_algos; | |
for (int i = 0; i < trials; ++i) | |
{ | |
std::generate(data.begin(), data.end(), [&] () { return dist(rng); }); | |
// You can un-comment the ~8 byte variation to make sure things still work with unaligned vectors. | |
const char* begin = data.data(); // + (dist(rng) % 8); | |
const char* end = data.data() + data.size(); // - (dist(rng) % 8); | |
for (auto iter = algos.begin(); iter != algos.end(); ++iter) | |
not_run_algos.emplace_back(iter); | |
std::shuffle(not_run_algos.begin(), not_run_algos.end(), rng); | |
std::uint32_t known = 0; | |
for ( ; !not_run_algos.empty(); not_run_algos.pop_back()) | |
{ | |
auto& runner = not_run_algos.back()->second; | |
auto res = time([&] { return std::get<0>(runner)(begin, end); }); | |
std::get<1>(runner) += res.elapsed; | |
std::get<2>(runner) += res.elapsed_ticks; | |
if (known == 0) | |
known = res.result; | |
else if (known != res.result) | |
throw std::runtime_error("Oh no!"); | |
} | |
} | |
auto nspb = [&] (std::chrono::nanoseconds x) | |
{ | |
return std::chrono::duration_cast<std::chrono::duration<double, std::nano>>(x) / trials / data_size; | |
}; | |
auto cycpb = [&] (std::int64_t x) { return double(x) / trials / data_size; }; | |
os << data_size; | |
for (const auto& entry : algos) | |
os << ',' << nspb(std::get<1>(entry.second)).count() | |
<< ',' << cycpb(std::get<2>(entry.second)) | |
; | |
os << std::endl; | |
} | |
int main() | |
{ | |
std::cout << "size,Boost_ns,Boost_cyc,a_ns,a_cyc,b_ns,b_cyc,c_ns,c_cyc,d_ns,d_cyc,e_ns,e_cyc,f_ns,f_cyc,g_ns,g_cyc,h_ns,h_cyc" << std::endl; | |
for (int shift = 8; shift <= 20; ++shift) | |
run_test(std::cout, 1 << shift); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment