tgockel/crc32.cpp

## crc32.cpp
/** \file crc32.cpp
 *  A number of implementations of a CRC32 algorithm. This program will only work on a CPU with SSE 4.2 support.
 *
 *  Compile with:
 *
 *  $> g++ --std=c++17 crc32.cpp -O3 -fwhole-program
 *
 *  The output is a CSV file showing the timings of the various implementations.
**/

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <boost/crc.hpp>

#include <smmintrin.h>
#include <x86intrin.h>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Boost                                                                                                              //
// Baseline. Uses bit shift and lookup table.                                                                         //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

std::uint32_t crc32_boost(const char* first, const char* last)
{
    boost::crc_optimal<32, 0x1edc6f41, 0xffffffff, 0xffffffff, true, true> machine;
    machine.process_block(first, last);
    return machine.checksum();
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// A                                                                                                                  //
// Simple implementation using the crc32b instruction.                                                                //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

__attribute__((target("sse4.2")))
std::uint32_t crc32a(const char* first, const char* last)
{
    std::uint32_t code = ~0U;
    for ( ; first != last; ++first)
        code = _mm_crc32_u8(code, *first);
    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// B                                                                                                                  //
// Same as A, but with the for_each algorithm.                                                                        //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

__attribute__((target("sse4.2")))
std::uint32_t crc32b(const char* first, const char* last)
{
    std::uint32_t code = ~0U;
    std::for_each(first, last,
                  [&] (const char& x) __attribute__((target("sse4.2"))) { code = _mm_crc32_u8(code, x); }
                 );
    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// C                                                                                                                  //
// Compute the next step with the largest chunk possible.                                                             //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

__attribute__((target("sse4.2")))
std::uint32_t crc32c(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    while (first != last)
    {
        if (reinterpret_cast<std::uintptr_t>(first) % 8 == 0 && first + 8 <= last)
        {
            code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first));
            first += 8;
        }
        else if (reinterpret_cast<std::uintptr_t>(first) % 4 == 0 && first + 4 <= last)
        {
            code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
            first += 4;
        }
        else if (reinterpret_cast<std::uintptr_t>(first) % 2 == 0 && first + 2 <= last)
        {
            code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
            first += 2;
        }
        else
        {
            code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
            ++first;
        }
    }

    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// D                                                                                                                  //
// Explicit striding.                                                                                                 //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

__attribute__((target("sse4.2")))
std::uint32_t crc32d(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    while (first + 1 <= last
          && (reinterpret_cast<std::uintptr_t>(first) % 2 != 0 || first + 2 > last)
          )
    {
        code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
        first += 1;
    }

    while (first + 2 <= last
          && (reinterpret_cast<std::uintptr_t>(first) % 4 != 0 || first + 4 > last)
          )
    {
        code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
        first += 2;
    }

    while (first + 4 <= last
          && (reinterpret_cast<std::uintptr_t>(first) % 8 != 0 || first + 8 > last)
          )
    {
        code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
        first += 4;
    }

    while (reinterpret_cast<std::uintptr_t>(first) % 8 == 0
          && first + 8 <= last
          )
    {
        code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(first));
        first += 8;
    }

    while (first + 4 <= last)
    {
        code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(first));
        first += 4;
    }

    while (first + 2 <= last)
    {
        code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(first));
        first += 2;
    }

    while (first + 1 <= last)
    {
        code = _mm_crc32_u8(code, *reinterpret_cast<const std::uint8_t*>(first));
        first += 1;
    }

    if (first != last)
        throw std::runtime_error("");
    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// E                                                                                                                  //
// Geneic
//                                                                                                                    //
// Geneic version of algorithm C.                                                                                     //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail
{

template <std::size_t... ByteAlign>
struct at_each_aligned_e_impl;

template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
struct at_each_aligned_e_impl<ByteAlign, ByteAlignRest...>
{
    template <typename TChar, typename FApply, typename... FApplyRest>
    static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
    {
        if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
           && first + ByteAlign <= last
           )
        {
            apply(first);
            return first + ByteAlign;
        }
        else
        {
            return at_each_aligned_e_impl<ByteAlignRest...>::step(first, last, apply_rest...);
        }
    }
};

template <>
struct at_each_aligned_e_impl<1>
{
    template <typename TChar, typename FApply>
    static TChar* step(TChar* first, TChar*, const FApply& apply)
    {
        apply(first);
        return first + 1;
    }
};

template <std::size_t... ByteAlign, typename TChar, typename... FApply>
void at_each_aligned_e(TChar* first, TChar* last, FApply&&... transform)
{
    while (first != last)
    {
        first = at_each_aligned_e_impl<ByteAlign...>::step(first, last, transform...);
    }
}

}

template <std::size_t... ByteAlign, typename... FApply>
void at_each_aligned_e(const char* first, const char* last, FApply&&... transform)
{
    return detail::at_each_aligned_e<ByteAlign...>(first, last,
                                                    std::forward<FApply>(transform)...
                                                   );
}

__attribute__((target("sse4.2")))
std::uint32_t crc32e(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    at_each_aligned_e<8, 4, 2, 1>
    (
        first, last,
        [&] (const char* p64) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
        },
        [&] (const char* p32) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
        },
        [&] (const char* p16) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
        },
        [&] (const char* p8)  __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
        }
    );

    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// F                                                                                                                  //
// Same as algorithm E, but with always_inline attributes.                                                            //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail
{

template <std::size_t... ByteAlign>
struct at_each_aligned_f_impl;

template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
struct at_each_aligned_f_impl<ByteAlign, ByteAlignRest...>
{
    template <typename TChar, typename FApply, typename... FApplyRest>
    __attribute__((always_inline)) inline
    static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
    {
        if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
           && first + ByteAlign <= last
           )
        {
            apply(first);
            return first + ByteAlign;
        }
        else
        {
            return at_each_aligned_f_impl<ByteAlignRest...>::step(first, last, apply_rest...);
        }
    }
};

template <>
struct at_each_aligned_f_impl<1>
{
    template <typename TChar, typename FApply>
    __attribute__((always_inline)) inline
    static TChar* step(TChar* first, TChar*, const FApply& apply)
    {
        apply(first);
        return first + 1;
    }
};

template <std::size_t... ByteAlign, typename TChar, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_f(TChar* first, TChar* last, FApply&&... transform)
{
    while (first != last)
    {
        first = at_each_aligned_f_impl<ByteAlign...>::step(first, last, transform...);
    }
}

}

template <std::size_t... ByteAlign, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_f(const char* first, const char* last, FApply&&... transform)
{
    return detail::at_each_aligned_f<ByteAlign...>(first, last,
                                                    std::forward<FApply>(transform)...
                                                   );
}

__attribute__((target("sse4.2")))
std::uint32_t crc32f(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    at_each_aligned_f<8, 4, 2, 1>
    (
        first, last,
        [&] (const char* p64) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
        },
        [&] (const char* p32) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
        },
        [&] (const char* p16) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
        },
        [&] (const char* p8)  __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
        }
    );

    return ~code;
}


////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G                                                                                                                  //
// Generic version of algorithm D.                                                                                    //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail
{

template <std::size_t ByteAlign, typename TChar, typename FApply>
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply)
{
    if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
    {
        while (first + ByteAlign <= last)
        {
            apply(first);
            first += ByteAlign;
        }
    }
    return first;
}

template <std::size_t ByteAlign,
          std::size_t ByteAlignNext,
          std::size_t... ByteAlignRest,
          typename TChar,
          typename FApply,
          typename... FApplyRest
         >
TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
    while (first + ByteAlign <= last
          && (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last)
          )
    {
        apply(first);
        first += ByteAlign;
    }

    first = at_each_aligned_g_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);

    while (first + ByteAlign <= last)
    {
        apply(first);
        first += ByteAlign;
    }

    return first;
}

}

template <std::size_t... ByteAlign, typename... FApply>
void at_each_aligned_g(const char* first, const char* last, FApply&&... transform)
{
    detail::at_each_aligned_g_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
}

__attribute__((target("sse4.2")))
std::uint32_t crc32g(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    at_each_aligned_g<1, 2, 4, 8>
    (
        first, last,
        [&] (const char* p8)  __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
        },
        [&] (const char* p16) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
        },
        [&] (const char* p32) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
        },
        [&] (const char* p64) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
        }
    );
    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// H                                                                                                                  //
// Inline version of algorithm G.                                                                                     //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail
{

template <std::size_t ByteAlign, typename TChar, typename FApply>
__attribute__((always_inline)) inline
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply)
{
    if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
    {
        while (first + ByteAlign <= last)
        {
            apply(first);
            first += ByteAlign;
        }
    }
    return first;
}

template <std::size_t ByteAlign,
          std::size_t ByteAlignNext,
          std::size_t... ByteAlignRest,
          typename TChar,
          typename FApply,
          typename... FApplyRest
         >
__attribute__((always_inline)) inline
TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
{
    while (first + ByteAlign <= last
          && (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 || first + ByteAlignNext > last)
          )
    {
        apply(first);
        first += ByteAlign;
    }

    first = at_each_aligned_h_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);

    while (first + ByteAlign <= last)
    {
        apply(first);
        first += ByteAlign;
    }

    return first;
}

}

template <std::size_t... ByteAlign, typename... FApply>
__attribute__((always_inline)) inline
void at_each_aligned_h(const char* first, const char* last, FApply&&... transform)
{
    detail::at_each_aligned_h_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
}

__attribute__((target("sse4.2")))
std::uint32_t crc32h(const char* first, const char* last)
{
    std::uint32_t code = ~0U;

    at_each_aligned_h<1, 2, 4, 8>
    (
        first, last,
        [&] (const char* p8)  __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u8 (code, *reinterpret_cast<const std::uint8_t*> (p8));
        },
        [&] (const char* p16) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u16(code, *reinterpret_cast<const std::uint16_t*>(p16));
        },
        [&] (const char* p32) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u32(code, *reinterpret_cast<const std::uint32_t*>(p32));
        },
        [&] (const char* p64) __attribute__((target("sse4.2")))
        {
            code = _mm_crc32_u64(code, *reinterpret_cast<const std::uint64_t*>(p64));
        }
    );
    return ~code;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Benchmarking                                                                                                       //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct time_result
{
    T                        result;
    std::chrono::nanoseconds elapsed;
    std::int64_t             elapsed_ticks;

    time_result(T&& res, std::chrono::nanoseconds time, std::int64_t ticks) :
            result(std::move(res)),
            elapsed(time),
            elapsed_ticks(ticks)
    { }
};

static int64_t rdtscp()
{
    unsigned int aux;
    return __rdtscp(&aux);
}

template <typename F>
auto time(F&& f) -> time_result<decltype(f())>
{
    using clock = std::chrono::steady_clock;

    auto start_time = clock::now();
    auto start_tick = rdtscp();
    auto res = std::forward<F>(f)();
    auto end_time = clock::now();
    auto end_tick = rdtscp();

    return { std::move(res), end_time - start_time, end_tick - start_tick };
}

template <typename T>
std::ostream& operator<<(std::ostream& os, const time_result<T>& x)
{
    os << "(result=" << x.result;
    os << " time=" << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(x.elapsed).count();
    os << ')';
    return os;
}

static void run_test(std::ostream& os, std::size_t data_size)
{
    const int trials = 5000;

    using contents_type = std::tuple<std::uint32_t (*)(const char*, const char*), std::chrono::nanoseconds, std::int64_t>;
    using algo_map_type = std::map<std::string, contents_type>;

    algo_map_type algos =
    {
        { "Boost",  { &crc32_boost, std::chrono::nanoseconds(0), 0 } },
        { "a", { &crc32a, std::chrono::nanoseconds(0), 0 } },
        { "b", { &crc32b, std::chrono::nanoseconds(0), 0 } },
        { "c", { &crc32c, std::chrono::nanoseconds(0), 0 } },
        { "d", { &crc32d, std::chrono::nanoseconds(0), 0 } },
        { "e", { &crc32e, std::chrono::nanoseconds(0), 0 } },
        { "f", { &crc32f, std::chrono::nanoseconds(0), 0 } },
        { "g", { &crc32g, std::chrono::nanoseconds(0), 0 } },
        { "h", { &crc32h, std::chrono::nanoseconds(0), 0 } },
    };

    std::vector<char> data(data_size);
    std::mt19937 rng(std::random_device{}());
    std::uniform_int_distribution<> dist;
    std::vector<algo_map_type::iterator> not_run_algos;

    for (int i = 0; i < trials; ++i)
    {
        std::generate(data.begin(), data.end(), [&] () { return dist(rng); });

        // You can un-comment the ~8 byte variation to make sure things still work with unaligned vectors.
        const char* begin = data.data(); // + (dist(rng) % 8);
        const char* end   = data.data() + data.size(); // - (dist(rng) % 8);

        for (auto iter = algos.begin(); iter != algos.end(); ++iter)
            not_run_algos.emplace_back(iter);
        std::shuffle(not_run_algos.begin(), not_run_algos.end(), rng);

        std::uint32_t known = 0;
        for ( ; !not_run_algos.empty(); not_run_algos.pop_back())
        {
            auto& runner = not_run_algos.back()->second;
            auto res = time([&] { return std::get<0>(runner)(begin, end); });
            std::get<1>(runner) += res.elapsed;
            std::get<2>(runner) += res.elapsed_ticks;

            if (known == 0)
                known = res.result;
            else if (known != res.result)
                throw std::runtime_error("Oh no!");
        }
    }

    auto nspb = [&] (std::chrono::nanoseconds x)
                {
                    return std::chrono::duration_cast<std::chrono::duration<double, std::nano>>(x) / trials / data_size;
                };
    auto cycpb = [&] (std::int64_t x) { return double(x) / trials / data_size; };

    os << data_size;
    for (const auto& entry : algos)
        os << ',' << nspb(std::get<1>(entry.second)).count()
           << ',' << cycpb(std::get<2>(entry.second))
           ;
    os << std::endl;
}

int main()
{
    std::cout << "size,Boost_ns,Boost_cyc,a_ns,a_cyc,b_ns,b_cyc,c_ns,c_cyc,d_ns,d_cyc,e_ns,e_cyc,f_ns,f_cyc,g_ns,g_cyc,h_ns,h_cyc" << std::endl;
    for (int shift = 8; shift <= 20; ++shift)
        run_test(std::cout, 1 << shift);
}
	/** \file crc32.cpp
	* A number of implementations of a CRC32 algorithm. This program will only work on a CPU with SSE 4.2 support.
	*
	* Compile with:
	*
	* $> g++ --std=c++17 crc32.cpp -O3 -fwhole-program
	*
	* The output is a CSV file showing the timings of the various implementations.
	**/

	#include <algorithm>
	#include <chrono>
	#include <cstdint>
	#include <iomanip>
	#include <iostream>
	#include <map>
	#include <random>
	#include <stdexcept>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	#include <boost/crc.hpp>

	#include <smmintrin.h>
	#include <x86intrin.h>

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Boost //
	// Baseline. Uses bit shift and lookup table. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	std::uint32_t crc32_boost(const char* first, const char* last)
	{
	boost::crc_optimal<32, 0x1edc6f41, 0xffffffff, 0xffffffff, true, true> machine;
	machine.process_block(first, last);
	return machine.checksum();
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// A //
	// Simple implementation using the crc32b instruction. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	__attribute__((target("sse4.2")))
	std::uint32_t crc32a(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;
	for ( ; first != last; ++first)
	code = _mm_crc32_u8(code, *first);
	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// B //
	// Same as A, but with the for_each algorithm. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	__attribute__((target("sse4.2")))
	std::uint32_t crc32b(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;
	std::for_each(first, last,
	[&] (const char& x) __attribute__((target("sse4.2"))) { code = _mm_crc32_u8(code, x); }
	);
	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// C //
	// Compute the next step with the largest chunk possible. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	__attribute__((target("sse4.2")))
	std::uint32_t crc32c(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	while (first != last)
	{
	if (reinterpret_cast<std::uintptr_t>(first) % 8 == 0 && first + 8 <= last)
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(first));
	first += 8;
	}
	else if (reinterpret_cast<std::uintptr_t>(first) % 4 == 0 && first + 4 <= last)
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(first));
	first += 4;
	}
	else if (reinterpret_cast<std::uintptr_t>(first) % 2 == 0 && first + 2 <= last)
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(first));
	first += 2;
	}
	else
	{
	code = _mm_crc32_u8(code, reinterpret_cast<const std::uint8_t>(first));
	++first;
	}
	}

	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// D //
	// Explicit striding. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	__attribute__((target("sse4.2")))
	std::uint32_t crc32d(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	while (first + 1 <= last
	&& (reinterpret_cast<std::uintptr_t>(first) % 2 != 0 \|\| first + 2 > last)
	)
	{
	code = _mm_crc32_u8(code, reinterpret_cast<const std::uint8_t>(first));
	first += 1;
	}

	while (first + 2 <= last
	&& (reinterpret_cast<std::uintptr_t>(first) % 4 != 0 \|\| first + 4 > last)
	)
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(first));
	first += 2;
	}

	while (first + 4 <= last
	&& (reinterpret_cast<std::uintptr_t>(first) % 8 != 0 \|\| first + 8 > last)
	)
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(first));
	first += 4;
	}

	while (reinterpret_cast<std::uintptr_t>(first) % 8 == 0
	&& first + 8 <= last
	)
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(first));
	first += 8;
	}

	while (first + 4 <= last)
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(first));
	first += 4;
	}

	while (first + 2 <= last)
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(first));
	first += 2;
	}

	while (first + 1 <= last)
	{
	code = _mm_crc32_u8(code, reinterpret_cast<const std::uint8_t>(first));
	first += 1;
	}

	if (first != last)
	throw std::runtime_error("");
	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// E //
	// Geneic
	// //
	// Geneic version of algorithm C. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	namespace detail
	{

	template <std::size_t... ByteAlign>
	struct at_each_aligned_e_impl;

	template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
	struct at_each_aligned_e_impl<ByteAlign, ByteAlignRest...>
	{
	template <typename TChar, typename FApply, typename... FApplyRest>
	static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
	{
	if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
	&& first + ByteAlign <= last
	)
	{
	apply(first);
	return first + ByteAlign;
	}
	else
	{
	return at_each_aligned_e_impl<ByteAlignRest...>::step(first, last, apply_rest...);
	}
	}
	};

	template <>
	struct at_each_aligned_e_impl<1>
	{
	template <typename TChar, typename FApply>
	static TChar* step(TChar* first, TChar*, const FApply& apply)
	{
	apply(first);
	return first + 1;
	}
	};

	template <std::size_t... ByteAlign, typename TChar, typename... FApply>
	void at_each_aligned_e(TChar* first, TChar* last, FApply&&... transform)
	{
	while (first != last)
	{
	first = at_each_aligned_e_impl<ByteAlign...>::step(first, last, transform...);
	}
	}

	}

	template <std::size_t... ByteAlign, typename... FApply>
	void at_each_aligned_e(const char* first, const char* last, FApply&&... transform)
	{
	return detail::at_each_aligned_e<ByteAlign...>(first, last,
	std::forward<FApply>(transform)...
	);
	}

	__attribute__((target("sse4.2")))
	std::uint32_t crc32e(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	at_each_aligned_e<8, 4, 2, 1>
	(
	first, last,
	[&] (const char* p64) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(p64));
	},
	[&] (const char* p32) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(p32));
	},
	[&] (const char* p16) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(p16));
	},
	[&] (const char* p8) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u8 (code, reinterpret_cast<const std::uint8_t> (p8));
	}
	);

	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// F //
	// Same as algorithm E, but with always_inline attributes. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	namespace detail
	{

	template <std::size_t... ByteAlign>
	struct at_each_aligned_f_impl;

	template <std::size_t ByteAlign, std::size_t... ByteAlignRest>
	struct at_each_aligned_f_impl<ByteAlign, ByteAlignRest...>
	{
	template <typename TChar, typename FApply, typename... FApplyRest>
	__attribute__((always_inline)) inline
	static TChar* step(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
	{
	if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0
	&& first + ByteAlign <= last
	)
	{
	apply(first);
	return first + ByteAlign;
	}
	else
	{
	return at_each_aligned_f_impl<ByteAlignRest...>::step(first, last, apply_rest...);
	}
	}
	};

	template <>
	struct at_each_aligned_f_impl<1>
	{
	template <typename TChar, typename FApply>
	__attribute__((always_inline)) inline
	static TChar* step(TChar* first, TChar*, const FApply& apply)
	{
	apply(first);
	return first + 1;
	}
	};

	template <std::size_t... ByteAlign, typename TChar, typename... FApply>
	__attribute__((always_inline)) inline
	void at_each_aligned_f(TChar* first, TChar* last, FApply&&... transform)
	{
	while (first != last)
	{
	first = at_each_aligned_f_impl<ByteAlign...>::step(first, last, transform...);
	}
	}

	}

	template <std::size_t... ByteAlign, typename... FApply>
	__attribute__((always_inline)) inline
	void at_each_aligned_f(const char* first, const char* last, FApply&&... transform)
	{
	return detail::at_each_aligned_f<ByteAlign...>(first, last,
	std::forward<FApply>(transform)...
	);
	}

	__attribute__((target("sse4.2")))
	std::uint32_t crc32f(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	at_each_aligned_f<8, 4, 2, 1>
	(
	first, last,
	[&] (const char* p64) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(p64));
	},
	[&] (const char* p32) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(p32));
	},
	[&] (const char* p16) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(p16));
	},
	[&] (const char* p8) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u8 (code, reinterpret_cast<const std::uint8_t> (p8));
	}
	);

	return ~code;
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// G //
	// Generic version of algorithm D. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	namespace detail
	{

	template <std::size_t ByteAlign, typename TChar, typename FApply>
	TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply)
	{
	if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
	{
	while (first + ByteAlign <= last)
	{
	apply(first);
	first += ByteAlign;
	}
	}
	return first;
	}

	template <std::size_t ByteAlign,
	std::size_t ByteAlignNext,
	std::size_t... ByteAlignRest,
	typename TChar,
	typename FApply,
	typename... FApplyRest
	>
	TChar* at_each_aligned_g_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
	{
	while (first + ByteAlign <= last
	&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 \|\| first + ByteAlignNext > last)
	)
	{
	apply(first);
	first += ByteAlign;
	}

	first = at_each_aligned_g_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);

	while (first + ByteAlign <= last)
	{
	apply(first);
	first += ByteAlign;
	}

	return first;
	}

	}

	template <std::size_t... ByteAlign, typename... FApply>
	void at_each_aligned_g(const char* first, const char* last, FApply&&... transform)
	{
	detail::at_each_aligned_g_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
	}

	__attribute__((target("sse4.2")))
	std::uint32_t crc32g(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	at_each_aligned_g<1, 2, 4, 8>
	(
	first, last,
	[&] (const char* p8) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u8 (code, reinterpret_cast<const std::uint8_t> (p8));
	},
	[&] (const char* p16) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(p16));
	},
	[&] (const char* p32) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(p32));
	},
	[&] (const char* p64) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(p64));
	}
	);
	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// H //
	// Inline version of algorithm G. //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	namespace detail
	{

	template <std::size_t ByteAlign, typename TChar, typename FApply>
	__attribute__((always_inline)) inline
	TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply)
	{
	if (reinterpret_cast<std::uintptr_t>(first) % ByteAlign == 0)
	{
	while (first + ByteAlign <= last)
	{
	apply(first);
	first += ByteAlign;
	}
	}
	return first;
	}

	template <std::size_t ByteAlign,
	std::size_t ByteAlignNext,
	std::size_t... ByteAlignRest,
	typename TChar,
	typename FApply,
	typename... FApplyRest
	>
	__attribute__((always_inline)) inline
	TChar* at_each_aligned_h_impl(TChar* first, TChar* last, const FApply& apply, const FApplyRest&... apply_rest)
	{
	while (first + ByteAlign <= last
	&& (reinterpret_cast<std::uintptr_t>(first) % ByteAlignNext != 0 \|\| first + ByteAlignNext > last)
	)
	{
	apply(first);
	first += ByteAlign;
	}

	first = at_each_aligned_h_impl<ByteAlignNext, ByteAlignRest...>(first, last, apply_rest...);

	while (first + ByteAlign <= last)
	{
	apply(first);
	first += ByteAlign;
	}

	return first;
	}

	}

	template <std::size_t... ByteAlign, typename... FApply>
	__attribute__((always_inline)) inline
	void at_each_aligned_h(const char* first, const char* last, FApply&&... transform)
	{
	detail::at_each_aligned_h_impl<ByteAlign...>(first, last, std::forward<FApply>(transform)...);
	}

	__attribute__((target("sse4.2")))
	std::uint32_t crc32h(const char* first, const char* last)
	{
	std::uint32_t code = ~0U;

	at_each_aligned_h<1, 2, 4, 8>
	(
	first, last,
	[&] (const char* p8) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u8 (code, reinterpret_cast<const std::uint8_t> (p8));
	},
	[&] (const char* p16) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u16(code, reinterpret_cast<const std::uint16_t>(p16));
	},
	[&] (const char* p32) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u32(code, reinterpret_cast<const std::uint32_t>(p32));
	},
	[&] (const char* p64) __attribute__((target("sse4.2")))
	{
	code = _mm_crc32_u64(code, reinterpret_cast<const std::uint64_t>(p64));
	}
	);
	return ~code;
	}

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Benchmarking //
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	template <typename T>
	struct time_result
	{
	T result;
	std::chrono::nanoseconds elapsed;
	std::int64_t elapsed_ticks;

	time_result(T&& res, std::chrono::nanoseconds time, std::int64_t ticks) :
	result(std::move(res)),
	elapsed(time),
	elapsed_ticks(ticks)
	{ }
	};

	static int64_t rdtscp()
	{
	unsigned int aux;
	return __rdtscp(&aux);
	}

	template <typename F>
	auto time(F&& f) -> time_result<decltype(f())>
	{
	using clock = std::chrono::steady_clock;

	auto start_time = clock::now();
	auto start_tick = rdtscp();
	auto res = std::forward<F>(f)();
	auto end_time = clock::now();
	auto end_tick = rdtscp();

	return { std::move(res), end_time - start_time, end_tick - start_tick };
	}

	template <typename T>
	std::ostream& operator<<(std::ostream& os, const time_result<T>& x)
	{
	os << "(result=" << x.result;
	os << " time=" << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(x.elapsed).count();
	os << ')';
	return os;
	}

	static void run_test(std::ostream& os, std::size_t data_size)
	{
	const int trials = 5000;

	using contents_type = std::tuple<std::uint32_t ()(const char, const char*), std::chrono::nanoseconds, std::int64_t>;
	using algo_map_type = std::map<std::string, contents_type>;

	algo_map_type algos =
	{
	{ "Boost", { &crc32_boost, std::chrono::nanoseconds(0), 0 } },
	{ "a", { &crc32a, std::chrono::nanoseconds(0), 0 } },
	{ "b", { &crc32b, std::chrono::nanoseconds(0), 0 } },
	{ "c", { &crc32c, std::chrono::nanoseconds(0), 0 } },
	{ "d", { &crc32d, std::chrono::nanoseconds(0), 0 } },
	{ "e", { &crc32e, std::chrono::nanoseconds(0), 0 } },
	{ "f", { &crc32f, std::chrono::nanoseconds(0), 0 } },
	{ "g", { &crc32g, std::chrono::nanoseconds(0), 0 } },
	{ "h", { &crc32h, std::chrono::nanoseconds(0), 0 } },
	};

	std::vector<char> data(data_size);
	std::mt19937 rng(std::random_device{}());
	std::uniform_int_distribution<> dist;
	std::vector<algo_map_type::iterator> not_run_algos;

	for (int i = 0; i < trials; ++i)
	{
	std::generate(data.begin(), data.end(), [&] () { return dist(rng); });

	// You can un-comment the ~8 byte variation to make sure things still work with unaligned vectors.
	const char* begin = data.data(); // + (dist(rng) % 8);
	const char* end = data.data() + data.size(); // - (dist(rng) % 8);

	for (auto iter = algos.begin(); iter != algos.end(); ++iter)
	not_run_algos.emplace_back(iter);
	std::shuffle(not_run_algos.begin(), not_run_algos.end(), rng);

	std::uint32_t known = 0;
	for ( ; !not_run_algos.empty(); not_run_algos.pop_back())
	{
	auto& runner = not_run_algos.back()->second;
	auto res = time([&] { return std::get<0>(runner)(begin, end); });
	std::get<1>(runner) += res.elapsed;
	std::get<2>(runner) += res.elapsed_ticks;

	if (known == 0)
	known = res.result;
	else if (known != res.result)
	throw std::runtime_error("Oh no!");
	}
	}

	auto nspb = [&] (std::chrono::nanoseconds x)
	{
	return std::chrono::duration_cast<std::chrono::duration<double, std::nano>>(x) / trials / data_size;
	};
	auto cycpb = [&] (std::int64_t x) { return double(x) / trials / data_size; };

	os << data_size;
	for (const auto& entry : algos)
	os << ',' << nspb(std::get<1>(entry.second)).count()
	<< ',' << cycpb(std::get<2>(entry.second))
	;
	os << std::endl;
	}

	int main()
	{
	std::cout << "size,Boost_ns,Boost_cyc,a_ns,a_cyc,b_ns,b_cyc,c_ns,c_cyc,d_ns,d_cyc,e_ns,e_cyc,f_ns,f_cyc,g_ns,g_cyc,h_ns,h_cyc" << std::endl;
	for (int shift = 8; shift <= 20; ++shift)
	run_test(std::cout, 1 << shift);
	}