Skip to content

Instantly share code, notes, and snippets.

@tgockel
Created January 12, 2017 21:14
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save tgockel/58381c41cb8b18fc8f5632159b3ce56f to your computer and use it in GitHub Desktop.
/** \file crc32.cpp
* A number of implementations of a CRC32 algorithm. This program will only work on a CPU with SSE 4.2 support.
*
* Compile with:
*
* $> g++ --std=c++17 crc32.cpp -O3 -fwhole-program
*
* The output is a CSV file showing the timings of the various implementations.
**/
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include <boost/crc.hpp>
#include <smmintrin.h>
#include <x86intrin.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Boost //
// Baseline. Uses bit shift and lookup table. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::uint32_t crc32_boost(const char* first, const char* last)
{
boost::crc_optimal<32, 0x1edc6f41, 0xffffffff, 0xffffffff, true, true> machine;
machine.process_block(first, last);
return machine.checksum();
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// A //
// Simple implementation using the crc32b instruction. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__attribute__((target("sse4.2")))
std::uint32_t crc32a(const char* first, const char* last)
{
std::uint32_t code = ~0U;
for ( ; first != last; ++first)
code = _mm_crc32_u8(code, *first);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// B //
// Same as A, but with the for_each algorithm. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__attribute__((target("sse4.2")))
std::uint32_t crc32b(const char* first, const char* last)
{
std::uint32_t code = ~0U;
std::for_each(first, last,
[&] (const char& x) __attribute__((target("sse4.2"))) { code = _mm_crc32_u8(code, x); }
);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// C //
// Compute the next step with the largest chunk possible. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__attribute__((target("sse4.2")))
std::uint32_t crc32c(const char* first, const char* last)
{
std::uint32_t code = ~0U;
while (first != last)
{
if (reinterpret_cast<std::uintptr_t>(first) % 8 == 0 && first + 8 <= last)
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first));
first += 8;
}
else if (reinterpret_cast<std::uintptr_t>(first) % 4 == 0 && first + 4 <= last)
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
first += 4;
}
else if (reinterpret_cast<std::uintptr_t>(first) % 2 == 0 && first + 2 <= last)
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
first += 2;
}
else
{
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
++first;
}
}
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// D //
// Explicit striding. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__attribute__((target("sse4.2")))
std::uint32_t crc32d(const char* first, const char* last)
{
std::uint32_t code = ~0U;
while (first + 1 <= last
&& (reinterpret_cast<std::uintptr_t>(first) % 2 != 0 || first + 2 > last)
)
{
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
first += 1;
}
while (first + 2 <= last
&& (reinterpret_cast<std::uintptr_t>(first) % 4 != 0 || first + 4 > last)
)
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
first += 2;
}
while (first + 4 <= last
&& (reinterpret_cast<std::uintptr_t>(first) % 8 != 0 || first + 8 > last)
)
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
first += 4;
}
while (reinterpret_cast<std::uintptr_t>(first) % 8 == 0
&& first + 8 <= last
)
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first));
first += 8;
}
while (first + 4 <= last)
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
first += 4;
}
while (first + 2 <= last)
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
first += 2;
}
while (first + 1 <= last)
{
code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
first += 1;
}
if (first != last)
throw std::runtime_error("");
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// E //
// Geneic
// //
// Geneic version of algorithm C. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail
{
template <std::size_t... ByteAlign>
struct at_each_aligned_e_impl;
template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
struct at_each_aligned_e_impl<ByteAlign, ByteAlignRest...>
{
template <typename TChar, typename FApply, typename... FApplyRest>
static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
&& first + ByteAlign <= last
)
{
apply(first);
return first + ByteAlign;
}
else
{
return at_each_aligned_e_impl<ByteAlignRest...>::step(first, last, apply_rest...);
}
}
};
template <>
struct at_each_aligned_e_impl<1>
{
template <typename TChar, typename FApply>
static TChar* step(TChar* first, TChar*, const FApply& apply)
{
apply(first);
return first + 1;
}
};
template <std::size_t... ByteAlign, typename TChar, typename... FApply>
void at_each_aligned_e(TChar* first, TChar* last, FApply&&... transform)
{
while (first != last)
{
first = at_each_aligned_e_impl<ByteAlign...>::step(first, last, transform...);
}
}
}
template <std::size_t... ByteAlign, typename... FApply>
void at_each_aligned_e(const char* first, const char* last, FApply&&... transform)
{
return detail::at_each_aligned_e<ByteAlign...>(first, last,
std::forward<FApply>(transform)...
);
}
__attribute__((target("sse4.2")))
std::uint32_t crc32e(const char* first, const char* last)
{
std::uint32_t code = ~0U;
at_each_aligned_e<8, 4, 2, 1>
(
first, last,
[&] (const char* p64) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
},
[&] (const char* p32) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
},
[&] (const char* p16) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
},
[&] (const char* p8) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
}
);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// F //
// Same as algorithm E, but with always_inline attributes. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail
{
template <std::size_t... ByteAlign>
struct at_each_aligned_f_impl;
template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
struct at_each_aligned_f_impl<ByteAlign, ByteAlignRest...>
{
template <typename TChar, typename FApply, typename... FApplyRest>
__attribute__((always_inline)) inline
static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
&& first + ByteAlign <= last
)
{
apply(first);
return first + ByteAlign;
}
else
{
return at_each_aligned_f_impl<ByteAlignRest...>::step(first, last, apply_rest...);
}
}
};
template <>
struct at_each_aligned_f_impl<1>
{
template <typename TChar, typename FApply>
__attribute__((always_inline)) inline
static TChar* step(TChar* first, TChar*, const FApply& apply)
{
apply(first);
return first + 1;
}
};
template <std::size_t... ByteAlign, typename TChar, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_f(TChar* first, TChar* last, FApply&&... transform)
{
while (first != last)
{
first = at_each_aligned_f_impl<ByteAlign...>::step(first, last, transform...);
}
}
}
template <std::size_t... ByteAlign, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_f(const char* first, const char* last, FApply&&... transform)
{
return detail::at_each_aligned_f<ByteAlign...>(first, last,
std::forward<FApply>(transform)...
);
}
__attribute__((target("sse4.2")))
std::uint32_t crc32f(const char* first, const char* last)
{
std::uint32_t code = ~0U;
at_each_aligned_f<8, 4, 2, 1>
(
first, last,
[&] (const char* p64) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
},
[&] (const char* p32) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
},
[&] (const char* p16) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
},
[&] (const char* p8) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
}
);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G //
// Generic version of algorithm D. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail
{
template <std::size_t ByteAlign, typename TChar, typename FApply>
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply)
{
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
{
while (first + ByteAlign <= last)
{
apply(first);
first += ByteAlign;
}
}
return first;
}
template <std::size_t ByteAlign,
std::size_t ByteAlignNext,
std::size_t... ByteAlignRest,
typename TChar,
typename FApply,
typename... FApplyRest
>
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
while (first + ByteAlign <= last
&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last)
)
{
apply(first);
first += ByteAlign;
}
first = at_each_aligned_g_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);
while (first + ByteAlign <= last)
{
apply(first);
first += ByteAlign;
}
return first;
}
}
template <std::size_t... ByteAlign, typename... FApply>
void at_each_aligned_g(const char* first, const char* last, FApply&&... transform)
{
detail::at_each_aligned_g_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
}
__attribute__((target("sse4.2")))
std::uint32_t crc32g(const char* first, const char* last)
{
std::uint32_t code = ~0U;
at_each_aligned_g<1, 2, 4, 8>
(
first, last,
[&] (const char* p8) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
},
[&] (const char* p16) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
},
[&] (const char* p32) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
},
[&] (const char* p64) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
}
);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// H //
// Inline version of algorithm G. //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail
{
template <std::size_t ByteAlign, typename TChar, typename FApply>
__attribute__((always_inline)) inline
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply)
{
if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
{
while (first + ByteAlign <= last)
{
apply(first);
first += ByteAlign;
}
}
return first;
}
template <std::size_t ByteAlign,
std::size_t ByteAlignNext,
std::size_t... ByteAlignRest,
typename TChar,
typename FApply,
typename... FApplyRest
>
__attribute__((always_inline)) inline
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
while (first + ByteAlign <= last
&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last)
)
{
apply(first);
first += ByteAlign;
}
first = at_each_aligned_h_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);
while (first + ByteAlign <= last)
{
apply(first);
first += ByteAlign;
}
return first;
}
}
template <std::size_t... ByteAlign, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_h(const char* first, const char* last, FApply&&... transform)
{
detail::at_each_aligned_h_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
}
__attribute__((target("sse4.2")))
std::uint32_t crc32h(const char* first, const char* last)
{
std::uint32_t code = ~0U;
at_each_aligned_h<1, 2, 4, 8>
(
first, last,
[&] (const char* p8) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
},
[&] (const char* p16) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
},
[&] (const char* p32) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
},
[&] (const char* p64) __attribute__((target("sse4.2")))
{
code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
}
);
return ~code;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Benchmarking //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T>
struct time_result
{
T result;
std::chrono::nanoseconds elapsed;
std::int64_t elapsed_ticks;
time_result(T&& res, std::chrono::nanoseconds time, std::int64_t ticks) :
result(std::move(res)),
elapsed(time),
elapsed_ticks(ticks)
{ }
};
static int64_t rdtscp()
{
unsigned int aux;
return __rdtscp(&aux);
}
template <typename F>
auto time(F&& f) -> time_result<decltype(f())>
{
using clock = std::chrono::steady_clock;
auto start_time = clock::now();
auto start_tick = rdtscp();
auto res = std::forward<F>(f)();
auto end_time = clock::now();
auto end_tick = rdtscp();
return { std::move(res), end_time - start_time, end_tick - start_tick };
}
template <typename T>
std::ostream& operator<<(std::ostream& os, const time_result<T>& x)
{
os << "(result=" << x.result;
os << " time=" << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(x.elapsed).count();
os << ')';
return os;
}
static void run_test(std::ostream& os, std::size_t data_size)
{
const int trials = 5000;
using contents_type = std::tuple<std::uint32_t (*)(const char*, const char*), std::chrono::nanoseconds, std::int64_t>;
using algo_map_type = std::map<std::string, contents_type>;
algo_map_type algos =
{
{ "Boost", { &crc32_boost, std::chrono::nanoseconds(0), 0 } },
{ "a", { &crc32a, std::chrono::nanoseconds(0), 0 } },
{ "b", { &crc32b, std::chrono::nanoseconds(0), 0 } },
{ "c", { &crc32c, std::chrono::nanoseconds(0), 0 } },
{ "d", { &crc32d, std::chrono::nanoseconds(0), 0 } },
{ "e", { &crc32e, std::chrono::nanoseconds(0), 0 } },
{ "f", { &crc32f, std::chrono::nanoseconds(0), 0 } },
{ "g", { &crc32g, std::chrono::nanoseconds(0), 0 } },
{ "h", { &crc32h, std::chrono::nanoseconds(0), 0 } },
};
std::vector<char> data(data_size);
std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<> dist;
std::vector<algo_map_type::iterator> not_run_algos;
for (int i = 0; i < trials; ++i)
{
std::generate(data.begin(), data.end(), [&] () { return dist(rng); });
// You can un-comment the ~8 byte variation to make sure things still work with unaligned vectors.
const char* begin = data.data(); // + (dist(rng) % 8);
const char* end = data.data() + data.size(); // - (dist(rng) % 8);
for (auto iter = algos.begin(); iter != algos.end(); ++iter)
not_run_algos.emplace_back(iter);
std::shuffle(not_run_algos.begin(), not_run_algos.end(), rng);
std::uint32_t known = 0;
for ( ; !not_run_algos.empty(); not_run_algos.pop_back())
{
auto& runner = not_run_algos.back()->second;
auto res = time([&] { return std::get<0>(runner)(begin, end); });
std::get<1>(runner) += res.elapsed;
std::get<2>(runner) += res.elapsed_ticks;
if (known == 0)
known = res.result;
else if (known != res.result)
throw std::runtime_error("Oh no!");
}
}
auto nspb = [&] (std::chrono::nanoseconds x)
{
return std::chrono::duration_cast<std::chrono::duration<double, std::nano>>(x) / trials / data_size;
};
auto cycpb = [&] (std::int64_t x) { return double(x) / trials / data_size; };
os << data_size;
for (const auto& entry : algos)
os << ',' << nspb(std::get<1>(entry.second)).count()
<< ',' << cycpb(std::get<2>(entry.second))
;
os << std::endl;
}
int main()
{
std::cout << "size,Boost_ns,Boost_cyc,a_ns,a_cyc,b_ns,b_cyc,c_ns,c_cyc,d_ns,d_cyc,e_ns,e_cyc,f_ns,f_cyc,g_ns,g_cyc,h_ns,h_cyc" << std::endl;
for (int shift = 8; shift <= 20; ++shift)
run_test(std::cout, 1 << shift);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment