ridiculousfish/divlu_benchmark.cpp

## divlu_benchmark.cpp
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <random>

__attribute__((noinline)) uint64_t divllu_v2_nobranch(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = (1ull << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder.
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the quotient and remainder.
    uint64_t remd1;
    uint64_t remd0;
    uint64_t qexcess1;
    uint64_t qexcess2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r != NULL) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = __builtin_clzll(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num1;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    // Correct both qhat and remainder.
    qexcess1 = -(rem > remd1);           // remd0 > remd1
    qexcess2 = qexcess1 & (-rem > den);  // remd0 - remd1 > den
    qhat += qexcess1;
    qhat -= qexcess2;
    rem += (den & qexcess1) << qexcess2;
    q1 = (uint32_t)qhat;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num0;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    qexcess1 = -(rem > remd1);            // remd0 > remd1
    qexcess2 = qexcess1 & -(-rem > den);  // remd0 - remd1 > den
    qhat += qexcess1 + qexcess2;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r != NULL) {
        rem += (den & qexcess1) + (den & qexcess2);
        *r = rem >> shift;
    }
    return ((uint64_t)q1 << 32) | q0;
}

__attribute__((noinline)) uint64_t divllu_v2_branch(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = (1ull << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder.
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the quotient and remainder.
    uint64_t remd1;
    uint64_t remd0;
    uint32_t qcorr;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r != NULL) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = __builtin_clzll(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num1;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    // Correct both qhat and remainder.
    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q1 = (uint32_t)qhat;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num0;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r != NULL) *r = rem >> shift;
    return ((uint64_t)q1 << 32) | q0;
}

__attribute__((noinline)) uint64_t divllu_mul(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = (1ull << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the estimated quotient.
    uint64_t c1;
    uint64_t c2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r != NULL) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = __builtin_clzll(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num1;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q1 = (uint32_t)qhat;

    // Compute the true (partial) remainder.
    rem = numhi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
    return ((uint64_t)q1 << 32) | q0;
}

__attribute__((noinline)) uint64_t divlu_orig(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32);  // Number base (16 bits).
    uint64_t un1, un0,                // Norm. dividend LSD's.
        vn1, vn0,                     // Norm. divisor digits.
        q1, q0,                       // Quotient digits.
        un64, un21, un10,             // Dividend digit pairs.
        rhat;                         // A remainder.
    int s;                            // Shift amount for norm.

    if (u1 >= v) {                // If overflow, set rem.
        if (r != NULL)            // to an impossible value,
            *r = (uint64_t)(-1);  // and return the largest
        return (uint64_t)(-1);
    }  // possible quotient.

    /* count leading zeros */
    s = __builtin_clzll(v);  // 0 <= s <= 63.

    v = v << s;            // Normalize divisor.
    vn1 = v >> 32;         // Break divisor up into
    vn0 = v & 0xFFFFFFFF;  // two 32-bit digits.

    un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
    un10 = u0 << s;  // Shift dividend left.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64 / vn1;         // Compute the first
    rhat = un64 - q1 * vn1;  // quotient digit, q1.
again1:
    if (q1 >= b || q1 * vn0 > b * rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;
    }

    un21 = un64 * b + un1 - q1 * v;  // Multiply and subtract.

    q0 = un21 / vn1;         // Compute the second
    rhat = un21 - q0 * vn1;  // quotient digit, q0.
again2:
    if (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;
    }

    if (r != NULL)                            // If remainder is wanted,
        *r = (un21 * b + un0 - q0 * v) >> s;  // return it.
    return q1 * b + q0;
}

#if defined(__x86_64__)
__attribute__((noinline)) uint64_t divllu_asm(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    uint64_t result;
    __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
}
#endif

constexpr size_t CASE_COUNT = 1 << 14;
constexpr size_t ITER_COUNT = 1 << 10;
struct case_t {
    uint64_t numerhi;
    uint64_t numerlo;
    uint64_t denom;

    bool valid() const { return denom > 0 && numerhi < denom; }
};

static const case_t *make_cases() {
    std::mt19937_64 mt(std::mt19937_64::default_seed);
    case_t *cases = new case_t[CASE_COUNT];
    for (size_t i = 0; i < CASE_COUNT; i++) {
        do {
            cases[i].numerlo = mt();
            cases[i].numerhi = mt();
            cases[i].denom = mt();
        } while (!cases[i].valid());
    }
    return cases;
}

using divider_t = uint64_t (*)(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r);

__attribute__((noinline)) static uint64_t time_function(const case_t *cases, divider_t div) {
    uint64_t sum = 0;
    for (size_t i = 0; i < CASE_COUNT; i++) {
        uint64_t quot;
        uint64_t rem;
        quot = div(cases[i].numerhi, cases[i].numerlo, cases[i].denom, &rem);
        sum += quot;
        sum += rem;
    }
    return sum;
}

int main(int argc, char *argv[]) {
    int onlycase = 0;
    if (argv[1]) onlycase = atoi(argv[1]);
    const struct {
        const char *name;
        divider_t func;
    } dividers[] = {
        {"hackers", divlu_orig},
        {"libdiv mul", divllu_mul},
        {"libdiv sub", divllu_v2_branch},
        {"libdiv sub bf", divllu_v2_branch},
#if defined(__x86_64__)
        {"divq", divllu_asm},
#endif
    };
    const case_t *cases = make_cases();
    using namespace std::chrono;
    const uint64_t exp = time_function(cases, divlu_orig);
    int whichcase = 0;
    for (const auto &d : dividers) {
        whichcase++;
        if (onlycase && whichcase != onlycase) continue;
        uint64_t best = std::numeric_limits<uint64_t>::max();
        for (size_t i = 0; i < ITER_COUNT; i++) {
            using namespace std::chrono;
            auto t1 = high_resolution_clock::now();
            uint64_t res = time_function(cases, d.func);
            auto t2 = high_resolution_clock::now();
            uint64_t nanos = duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
            best = std::min(best, nanos);
            if (res != exp) abort();
        }
        double nsec = (double)best / (double)CASE_COUNT;
        printf("%18s\t%4.1f\n", d.name, nsec);
    }
    return 0;
}
	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#include <chrono>
	#include <random>

	__attribute__((noinline)) uint64_t divllu_v2_nobranch(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	// We work in base 2**32.
	// A uint32 holds a single digit. A uint64 holds two digits.
	// Our numerator is conceptually [num3, num2, num1, num0].
	// Our denominator is [den1, den0].
	const uint64_t b = (1ull << 32);

	// The high and low digits of our computed quotient.
	uint32_t q1;
	uint32_t q0;

	// The normalization shift factor.
	int shift;

	// The high and low digits of our denominator (after normalizing).
	// Also the low 2 digits of our numerator (after normalizing).
	uint32_t den1;
	uint32_t den0;
	uint32_t num1;
	uint32_t num0;

	// A partial remainder.
	uint64_t rem;

	// The estimated quotient, and its corresponding remainder.
	uint64_t qhat;
	uint64_t rhat;

	// Variables used to correct the quotient and remainder.
	uint64_t remd1;
	uint64_t remd0;
	uint64_t qexcess1;
	uint64_t qexcess2;

	// Check for overflow and divide by 0.
	if (numhi >= den) {
	if (r != NULL) *r = ~0ull;
	return ~0ull;
	}

	// Determine the normalization factor. We multiply den by this, so that its leading digit is at
	// least half b. In binary this means just shifting left by the number of leading zeros, so that
	// there's a 1 in the MSB.
	// We also shift numer by the same amount. This cannot overflow because numhi < den.
	// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
	// by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
	// 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
	shift = __builtin_clzll(den);
	den <<= shift;
	numhi <<= shift;
	numhi \|= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
	numlo <<= shift;

	// Extract the low digits of the numerator and both digits of the denominator.
	num1 = (uint32_t)(numlo >> 32);
	num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
	den1 = (uint32_t)(den >> 32);
	den0 = (uint32_t)(den & 0xFFFFFFFFu);

	// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
	// Estimate q1 as [n3 n2] / [d1], and then correct it.
	// Note while qhat may be 2 digits, q1 is always 1 digit.
	qhat = numhi / den1;
	rhat = numhi % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num1;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	// Correct both qhat and remainder.
	qexcess1 = -(rem > remd1); // remd0 > remd1
	qexcess2 = qexcess1 & (-rem > den); // remd0 - remd1 > den
	qhat += qexcess1;
	qhat -= qexcess2;
	rem += (den & qexcess1) << qexcess2;
	q1 = (uint32_t)qhat;

	// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
	// Estimate q0 as [rem1 rem0] / [d1] and correct it.
	qhat = rem / den1;
	rhat = rem % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num0;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	qexcess1 = -(rem > remd1); // remd0 > remd1
	qexcess2 = qexcess1 & -(-rem > den); // remd0 - remd1 > den
	qhat += qexcess1 + qexcess2;
	q0 = (uint32_t)qhat;

	// Return remainder if requested.
	if (r != NULL) {
	rem += (den & qexcess1) + (den & qexcess2);
	*r = rem >> shift;
	}
	return ((uint64_t)q1 << 32) \| q0;
	}

	__attribute__((noinline)) uint64_t divllu_v2_branch(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	// We work in base 2**32.
	// A uint32 holds a single digit. A uint64 holds two digits.
	// Our numerator is conceptually [num3, num2, num1, num0].
	// Our denominator is [den1, den0].
	const uint64_t b = (1ull << 32);

	// The high and low digits of our computed quotient.
	uint32_t q1;
	uint32_t q0;

	// The normalization shift factor.
	int shift;

	// The high and low digits of our denominator (after normalizing).
	// Also the low 2 digits of our numerator (after normalizing).
	uint32_t den1;
	uint32_t den0;
	uint32_t num1;
	uint32_t num0;

	// A partial remainder.
	uint64_t rem;

	// The estimated quotient, and its corresponding remainder.
	uint64_t qhat;
	uint64_t rhat;

	// Variables used to correct the quotient and remainder.
	uint64_t remd1;
	uint64_t remd0;
	uint32_t qcorr;

	// Check for overflow and divide by 0.
	if (numhi >= den) {
	if (r != NULL) *r = ~0ull;
	return ~0ull;
	}

	// Determine the normalization factor. We multiply den by this, so that its leading digit is at
	// least half b. In binary this means just shifting left by the number of leading zeros, so that
	// there's a 1 in the MSB.
	// We also shift numer by the same amount. This cannot overflow because numhi < den.
	// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
	// by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
	// 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
	shift = __builtin_clzll(den);
	den <<= shift;
	numhi <<= shift;
	numhi \|= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
	numlo <<= shift;

	// Extract the low digits of the numerator and both digits of the denominator.
	num1 = (uint32_t)(numlo >> 32);
	num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
	den1 = (uint32_t)(den >> 32);
	den0 = (uint32_t)(den & 0xFFFFFFFFu);

	// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
	// Estimate q1 as [n3 n2] / [d1], and then correct it.
	// Note while qhat may be 2 digits, q1 is always 1 digit.
	qhat = numhi / den1;
	rhat = numhi % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num1;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	// Correct both qhat and remainder.
	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q1 = (uint32_t)qhat;

	// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
	// Estimate q0 as [rem1 rem0] / [d1] and correct it.
	qhat = rem / den1;
	rhat = rem % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num0;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q0 = (uint32_t)qhat;

	// Return remainder if requested.
	if (r != NULL) *r = rem >> shift;
	return ((uint64_t)q1 << 32) \| q0;
	}

	__attribute__((noinline)) uint64_t divllu_mul(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	// We work in base 2**32.
	// A uint32 holds a single digit. A uint64 holds two digits.
	// Our numerator is conceptually [num3, num2, num1, num0].
	// Our denominator is [den1, den0].
	const uint64_t b = (1ull << 32);

	// The high and low digits of our computed quotient.
	uint32_t q1;
	uint32_t q0;

	// The normalization shift factor.
	int shift;

	// The high and low digits of our denominator (after normalizing).
	// Also the low 2 digits of our numerator (after normalizing).
	uint32_t den1;
	uint32_t den0;
	uint32_t num1;
	uint32_t num0;

	// A partial remainder.
	uint64_t rem;

	// The estimated quotient, and its corresponding remainder (unrelated to true remainder).
	uint64_t qhat;
	uint64_t rhat;

	// Variables used to correct the estimated quotient.
	uint64_t c1;
	uint64_t c2;

	// Check for overflow and divide by 0.
	if (numhi >= den) {
	if (r != NULL) *r = ~0ull;
	return ~0ull;
	}

	// Determine the normalization factor. We multiply den by this, so that its leading digit is at
	// least half b. In binary this means just shifting left by the number of leading zeros, so that
	// there's a 1 in the MSB.
	// We also shift numer by the same amount. This cannot overflow because numhi < den.
	// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
	// by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
	// 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
	shift = __builtin_clzll(den);
	den <<= shift;
	numhi <<= shift;
	numhi \|= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
	numlo <<= shift;

	// Extract the low digits of the numerator and both digits of the denominator.
	num1 = (uint32_t)(numlo >> 32);
	num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
	den1 = (uint32_t)(den >> 32);
	den0 = (uint32_t)(den & 0xFFFFFFFFu);

	// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
	// Estimate q1 as [n3 n2] / [d1], and then correct it.
	// Note while qhat may be 2 digits, q1 is always 1 digit.
	qhat = numhi / den1;
	rhat = numhi % den1;
	c1 = qhat * den0;
	c2 = rhat * b + num1;
	if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
	q1 = (uint32_t)qhat;

	// Compute the true (partial) remainder.
	rem = numhi * b + num1 - q1 * den;

	// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
	// Estimate q0 as [rem1 rem0] / [d1] and correct it.
	qhat = rem / den1;
	rhat = rem % den1;
	c1 = qhat * den0;
	c2 = rhat * b + num0;
	if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
	q0 = (uint32_t)qhat;

	// Return remainder if requested.
	if (r != NULL) r = (rem b + num0 - q0 * den) >> shift;
	return ((uint64_t)q1 << 32) \| q0;
	}

	__attribute__((noinline)) uint64_t divlu_orig(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
	const uint64_t b = (1ULL << 32); // Number base (16 bits).
	uint64_t un1, un0, // Norm. dividend LSD's.
	vn1, vn0, // Norm. divisor digits.
	q1, q0, // Quotient digits.
	un64, un21, un10, // Dividend digit pairs.
	rhat; // A remainder.
	int s; // Shift amount for norm.

	if (u1 >= v) { // If overflow, set rem.
	if (r != NULL) // to an impossible value,
	*r = (uint64_t)(-1); // and return the largest
	return (uint64_t)(-1);
	} // possible quotient.

	/* count leading zeros */
	s = __builtin_clzll(v); // 0 <= s <= 63.

	v = v << s; // Normalize divisor.
	vn1 = v >> 32; // Break divisor up into
	vn0 = v & 0xFFFFFFFF; // two 32-bit digits.

	un64 = (u1 << s) \| ((u0 >> (64 - s)) & (-s >> 31));
	un10 = u0 << s; // Shift dividend left.

	un1 = un10 >> 32; // Break right half of
	un0 = un10 & 0xFFFFFFFF; // dividend into two digits.

	q1 = un64 / vn1; // Compute the first
	rhat = un64 - q1 * vn1; // quotient digit, q1.
	again1:
	if (q1 >= b \|\| q1 * vn0 > b * rhat + un1) {
	q1 = q1 - 1;
	rhat = rhat + vn1;
	if (rhat < b) goto again1;
	}

	un21 = un64 * b + un1 - q1 * v; // Multiply and subtract.

	q0 = un21 / vn1; // Compute the second
	rhat = un21 - q0 * vn1; // quotient digit, q0.
	again2:
	if (q0 >= b \|\| q0 * vn0 > b * rhat + un0) {
	q0 = q0 - 1;
	rhat = rhat + vn1;
	if (rhat < b) goto again2;
	}

	if (r != NULL) // If remainder is wanted,
	r = (un21 b + un0 - q0 * v) >> s; // return it.
	return q1 * b + q0;
	}

	#if defined(__x86_64__)
	__attribute__((noinline)) uint64_t divllu_asm(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	uint64_t result;
	__asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
	return result;
	}
	#endif

	constexpr size_t CASE_COUNT = 1 << 14;
	constexpr size_t ITER_COUNT = 1 << 10;
	struct case_t {
	uint64_t numerhi;
	uint64_t numerlo;
	uint64_t denom;

	bool valid() const { return denom > 0 && numerhi < denom; }
	};

	static const case_t *make_cases() {
	std::mt19937_64 mt(std::mt19937_64::default_seed);
	case_t *cases = new case_t[CASE_COUNT];
	for (size_t i = 0; i < CASE_COUNT; i++) {
	do {
	cases[i].numerlo = mt();
	cases[i].numerhi = mt();
	cases[i].denom = mt();
	} while (!cases[i].valid());
	}
	return cases;
	}

	using divider_t = uint64_t ()(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t r);

	__attribute__((noinline)) static uint64_t time_function(const case_t *cases, divider_t div) {
	uint64_t sum = 0;
	for (size_t i = 0; i < CASE_COUNT; i++) {
	uint64_t quot;
	uint64_t rem;
	quot = div(cases[i].numerhi, cases[i].numerlo, cases[i].denom, &rem);
	sum += quot;
	sum += rem;
	}
	return sum;
	}

	int main(int argc, char *argv[]) {
	int onlycase = 0;
	if (argv[1]) onlycase = atoi(argv[1]);
	const struct {
	const char *name;
	divider_t func;
	} dividers[] = {
	{"hackers", divlu_orig},
	{"libdiv mul", divllu_mul},
	{"libdiv sub", divllu_v2_branch},
	{"libdiv sub bf", divllu_v2_branch},
	#if defined(__x86_64__)
	{"divq", divllu_asm},
	#endif
	};
	const case_t *cases = make_cases();
	using namespace std::chrono;
	const uint64_t exp = time_function(cases, divlu_orig);
	int whichcase = 0;
	for (const auto &d : dividers) {
	whichcase++;
	if (onlycase && whichcase != onlycase) continue;
	uint64_t best = std::numeric_limits<uint64_t>::max();
	for (size_t i = 0; i < ITER_COUNT; i++) {
	using namespace std::chrono;
	auto t1 = high_resolution_clock::now();
	uint64_t res = time_function(cases, d.func);
	auto t2 = high_resolution_clock::now();
	uint64_t nanos = duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
	best = std::min(best, nanos);
	if (res != exp) abort();
	}
	double nsec = (double)best / (double)CASE_COUNT;
	printf("%18s\t%4.1f\n", d.name, nsec);
	}
	return 0;
	}