danlark1/benchmark_int128.cc

## benchmark_int128.cc
// Installation.
// Abseil, compile with -labseil
// Libdivide as header.
// GMP as header, compile with -lgmp
// Change the private constructor of absl::uin128 to public in order to use public inheritance.
// GMOCK, GUNIT, compile it.
#include "absl/numeric/int128.h"

#include <random>

#include "libdivide.h"
#include "benchmark/benchmark.h"
#include "base/public/gmock.h"
#include "base/public/gunit.h"
#include "gmp.h"

typedef unsigned tu_int __attribute__((mode(TI)));
typedef int ti_int __attribute__((mode(TI)));
typedef uint64_t du_int;
typedef uint32_t su_int;

typedef union {
  tu_int all;
  struct {
    du_int low;
    du_int high;
  } s;
} utwords;

tu_int LLVM_divmod(tu_int a, tu_int b, tu_int* rem) {
  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
  utwords n;
  n.all = a;
  utwords d;
  d.all = b;
  utwords q;
  utwords r;
  unsigned sr;
  // special cases, X is unknown, K != 0
  if (n.s.high == 0) {
    if (d.s.high == 0) {
      // 0 X
      // ---
      // 0 X
      if (rem) *rem = n.s.low % d.s.low;
      return n.s.low / d.s.low;
    }
    // 0 X
    // ---
    // K X
    if (rem) *rem = n.s.low;
    return 0;
  }
  // n.s.high != 0
  if (d.s.low == 0) {
    if (d.s.high == 0) {
      // K X
      // ---
      // 0 0
      if (rem) *rem = n.s.high % d.s.low;
      return n.s.high / d.s.low;
    }
    // d.s.high != 0
    if (n.s.low == 0) {
      // K 0
      // ---
      // K 0
      if (rem) {
        r.s.high = n.s.high % d.s.high;
        r.s.low = 0;
        *rem = r.all;
      }
      return n.s.high / d.s.high;
    }
    // K K
    // ---
    // K 0
    if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ {
      if (rem) {
        r.s.low = n.s.low;
        r.s.high = n.s.high & (d.s.high - 1);
        *rem = r.all;
      }
      return n.s.high >> __builtin_ctzll(d.s.high);
    }
    // K K
    // ---
    // K 0
    sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
    // 0 <= sr <= n_udword_bits - 2 or sr large
    if (sr > n_udword_bits - 2) {
      if (rem) *rem = n.all;
      return 0;
    }
    ++sr;
    // 1 <= sr <= n_udword_bits - 1
    // q.all = n.all << (n_utword_bits - sr);
    q.s.low = 0;
    q.s.high = n.s.low << (n_udword_bits - sr);
    // r.all = n.all >> sr;
    r.s.high = n.s.high >> sr;
    r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
  } else /* d.s.low != 0 */ {
    if (d.s.high == 0) {
      // K X
      // ---
      // 0 K
      if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ {
        if (rem) *rem = n.s.low & (d.s.low - 1);
        if (d.s.low == 1) return n.all;
        sr = __builtin_ctzll(d.s.low);
        q.s.high = n.s.high >> sr;
        q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
        return q.all;
      }
      // K X
      // ---
      // 0 K
      sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) -
           __builtin_clzll(n.s.high);
      // 2 <= sr <= n_utword_bits - 1
      // q.all = n.all << (n_utword_bits - sr);
      // r.all = n.all >> sr;
      if (sr == n_udword_bits) {
        q.s.low = 0;
        q.s.high = n.s.low;
        r.s.high = 0;
        r.s.low = n.s.high;
      } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ {
        q.s.low = 0;
        q.s.high = n.s.low << (n_udword_bits - sr);
        r.s.high = n.s.high >> sr;
        r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
      } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ {
        q.s.low = n.s.low << (n_utword_bits - sr);
        q.s.high = (n.s.high << (n_utword_bits - sr)) |
                   (n.s.low >> (sr - n_udword_bits));
        r.s.high = 0;
        r.s.low = n.s.high >> (sr - n_udword_bits);
      }
    } else {
      // K X
      // ---
      // K K
      sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
      // 0 <= sr <= n_udword_bits - 1 or sr large
      if (sr > n_udword_bits - 1) {
        if (rem) *rem = n.all;
        return 0;
      }
      ++sr;
      // 1 <= sr <= n_udword_bits
      // q.all = n.all << (n_utword_bits - sr);
      // r.all = n.all >> sr;
      q.s.low = 0;
      if (sr == n_udword_bits) {
        q.s.high = n.s.low;
        r.s.high = 0;
        r.s.low = n.s.high;
      } else {
        r.s.high = n.s.high >> sr;
        r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr);
        q.s.high = n.s.low << (n_udword_bits - sr);
      }
    }
  }
  // Not a special case
  // q and r are initialized with:
  // q.all = n.all << (n_utword_bits - sr);
  // r.all = n.all >> sr;
  // 1 <= sr <= n_utword_bits - 1
  su_int carry = 0;
  for (; sr > 0; --sr) {
    // r:q = ((r:q)  << 1) | carry
    r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1));
    r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1));
    q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1));
    q.s.low = (q.s.low << 1) | carry;
    // carry = 0;
    // if (r.all >= d.all)
    // {
    //     r.all -= d.all;
    //      carry = 1;
    // }
    const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1);
    carry = s & 1;
    r.all -= d.all & s;
  }
  q.all = (q.all << 1) | carry;
  if (rem) *rem = r.all;
  return q.all;
}

inline du_int Divide128div64to64Primitive(du_int u1, du_int u0, du_int v,
                                          du_int* r) {
  // Code taken from Hacker's Delight:
  // http://www.hackersdelight.org/HDcode/divlu.c.
  // License permits inclusion here per:
  // http://www.hackersdelight.org/permissions.htm

  const uint64_t b = (1ULL << 32);  // Number base (32 bits)
  uint64_t un1, un0;                // Norm. dividend LSD's
  uint64_t vn1, vn0;                // Norm. divisor digits
  uint64_t q1, q0;                  // Quotient digits
  uint64_t un64, un21, un10;        // Dividend digit pairs
  uint64_t rhat;                    // A remainder
  int32_t s;                        // Shift amount for norm

  // If overflow, set rem. to an impossible value,
  // and return the largest possible quotient
  if (u1 >= v) {
    *r = (uint64_t)-1;
    return (uint64_t)-1;
  }

  // count leading zeros
  s = __builtin_clzll(v);
  if (s > 0) {
    // Normalize divisor
    v = v << s;
    un64 = (u1 << s) | (u0 >> (64 - s));
    un10 = u0 << s;  // Shift dividend left
  } else {
    // Avoid undefined behavior of (u0 >> 64).
    // The behavior is undefined if the right operand is
    // negative, or greater than or equal to the length
    // in bits of the promoted left operand.
    un64 = u1;
    un10 = u0;
  }

  // Break divisor up into two 32-bit digits
  vn1 = v >> 32;
  vn0 = v & 0xFFFFFFFF;

  // Break right half of dividend into two digits
  un1 = un10 >> 32;
  un0 = un10 & 0xFFFFFFFF;

  // Compute the first quotient digit, q1
  q1 = un64 / vn1;
  rhat = un64 - q1 * vn1;

  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
    q1 = q1 - 1;
    rhat = rhat + vn1;
    if (rhat >= b) break;
  }

  // Multiply and subtract
  un21 = un64 * b + un1 - q1 * v;

  // Compute the second quotient digit
  q0 = un21 / vn1;
  rhat = un21 - q0 * vn1;

  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
    q0 = q0 - 1;
    rhat = rhat + vn1;
    if (rhat >= b) break;
  }

  *r = (un21 * b + un0 - q0 * v) >> s;
  return q1 * b + q0;
}

inline du_int Divide128div64to64(du_int u1, du_int u0, du_int v, du_int* r) {
#if defined(__x86_64__)
  du_int result;
  __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
  return result;
#else
  return Divide128div64to64Primitive(u1, u0, v, r);
#endif
}

struct OptimizedDiv {
  du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
    return Divide128div64to64(u1, u0, v, r);
  }
};

struct PrimitiveDiv {
  du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
    return Divide128div64to64Primitive(u1, u0, v, r);
  }
};

struct GmpDiv {
  du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
    mp_limb_t q[2] = {u0, u1};
    mp_limb_t result[2] = {0, 0};
    *r = mpn_divrem_1(result, 0, q, 2, v);
    return result[0];
  }
};

template <typename Div128by64>
tu_int MyDivMod1(tu_int a, tu_int b, tu_int* rem) {
  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
  (void)n_utword_bits;
  utwords dividend;
  dividend.all = a;
  utwords divisor;
  divisor.all = b;
  utwords quotient;
  utwords remainder;
  if (divisor.s.high == 0) {
    remainder.s.high = 0;
    if (dividend.s.high < divisor.s.low) {
      quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
                                    divisor.s.low, &remainder.s.low);
      quotient.s.high = 0;
    } else {
      quotient.s.high =
          Div128by64{}(0, dividend.s.high, divisor.s.low, &dividend.s.high);
      quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
                                    divisor.s.low, &remainder.s.low);
    }
    if (rem) *rem = remainder.all;
    return quotient.all;
  }
  int shift =
      __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
  divisor.all <<= shift;
  quotient.s.high = 0;
  quotient.s.low = 0;
  for (; shift >= 0; --shift) {
    quotient.s.low <<= 1;
    // const ti_int s = (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
    // quotient.s.low |= s & 1;
    if (dividend.all >= divisor.all) {
      dividend.all -= divisor.all;
      quotient.s.low |= 1;
    }
    // dividend.all -= divisor.all & s;
    divisor.all >>= 1;
  }
  if (rem) *rem = dividend.all;
  return quotient.all;
}

template <typename Div128by64>
tu_int MyDivMod2(tu_int a, tu_int b, tu_int* rem) {
  utwords dividend;
  dividend.all = a;
  utwords divisor;
  divisor.all = b;
  utwords quotient;
  utwords remainder;
  if (divisor.all > dividend.all) {
    if (rem) *rem = dividend.all;
    return 0;
  }
  if (divisor.s.high == 0) {
    remainder.s.high = 0;
    if (dividend.s.high < divisor.s.low) {
      quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
                                    divisor.s.low, &remainder.s.low);
      quotient.s.high = 0;
    } else {
      quotient.s.high =
          Div128by64{}(0, dividend.s.high, divisor.s.low, &dividend.s.high);
      quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
                                    divisor.s.low, &remainder.s.low);
    }
    if (rem) *rem = remainder.all;
    return quotient.all;
  }
  quotient.s.high = 0;
  quotient.s.low = libdivide::libdivide_128_div_128_to_64(
      dividend.s.high, dividend.s.low, divisor.s.high, divisor.s.low,
      &remainder.s.high, &remainder.s.low);
  if (rem) {
    *rem = remainder.all;
  }
  return quotient.all;
}

struct DivisionImpl : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  DivisionImpl(absl::uint128 u) : Base(u) {}
};

struct LLVMDivision : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  LLVMDivision(absl::uint128 u) : Base(u) {}
};

absl::uint128 operator/(const LLVMDivision& lhs, const LLVMDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords n;
  n.all = LLVM_divmod(lhs_u.all, rhs_u.all, nullptr);
  return absl::MakeUint128(n.s.high, n.s.low);
}

absl::uint128 operator%(const LLVMDivision& lhs, const LLVMDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords rem;
  LLVM_divmod(lhs_u.all, rhs_u.all, &rem.all);
  return absl::MakeUint128(rem.s.high, rem.s.low);
}

struct MyDivision1 : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  MyDivision1(absl::uint128 u) : Base(u) {}
};

absl::uint128 operator/(const MyDivision1& lhs, const MyDivision1& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords n;
  n.all = MyDivMod1<OptimizedDiv>(lhs_u.all, rhs_u.all, nullptr);
  return absl::MakeUint128(n.s.high, n.s.low);
}

absl::uint128 operator%(const MyDivision1& lhs, const MyDivision1& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords rem;
  MyDivMod1<OptimizedDiv>(lhs_u.all, rhs_u.all, &rem.all);
  return absl::MakeUint128(rem.s.high, rem.s.low);
}

struct MyDivision2 : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  MyDivision2(absl::uint128 u) : Base(u) {}
};

absl::uint128 operator/(const MyDivision2& lhs, const MyDivision2& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords n;
  n.all = MyDivMod1<PrimitiveDiv>(lhs_u.all, rhs_u.all, nullptr);
  return absl::MakeUint128(n.s.high, n.s.low);
}

absl::uint128 operator%(const MyDivision2& lhs, const MyDivision2& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords rem;
  MyDivMod1<PrimitiveDiv>(lhs_u.all, rhs_u.all, &rem.all);
  return absl::MakeUint128(rem.s.high, rem.s.low);
}

struct GmpDivision : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  GmpDivision(absl::uint128 u) : Base(u) {}
};

absl::uint128 operator/(const GmpDivision& lhs, const GmpDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords n;
  n.all = MyDivMod1<GmpDiv>(lhs_u.all, rhs_u.all, nullptr);
  return absl::MakeUint128(n.s.high, n.s.low);
}

absl::uint128 operator%(const GmpDivision& lhs, const GmpDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords rem;
  MyDivMod1<GmpDiv>(lhs_u.all, rhs_u.all, &rem.all);
  return absl::MakeUint128(rem.s.high, rem.s.low);
}


struct LibDivideDivision : public absl::uint128 {
  using Base = absl::uint128;
  using Base::Base;
  LibDivideDivision(absl::uint128 u) : Base(u) {}
};

absl::uint128 operator/(const LibDivideDivision& lhs, const LibDivideDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords n;
  n.all = MyDivMod2<OptimizedDiv>(lhs_u.all, rhs_u.all, nullptr);
  return absl::MakeUint128(n.s.high, n.s.low);
}

absl::uint128 operator%(const LibDivideDivision& lhs, const LibDivideDivision& rhs) {
  utwords lhs_u;
  utwords rhs_u;
  lhs_u.s.low = absl::Uint128Low64(lhs);
  lhs_u.s.high = absl::Uint128High64(lhs);
  rhs_u.s.low = absl::Uint128Low64(rhs);
  rhs_u.s.high = absl::Uint128High64(rhs);
  utwords rem;
  MyDivMod2<OptimizedDiv>(lhs_u.all, rhs_u.all, &rem.all);
  return absl::MakeUint128(rem.s.high, rem.s.low);
}

namespace {

template <typename T>
void TestDivision() {
  using std::swap;
  // a := q * b + r
  T a, b, q, r;

  // Zero test.
  a = 0;
  b = 123;
  q = a / b;
  r = a % b;
  EXPECT_EQ(0, q);
  EXPECT_EQ(0, r);

  a = T(0x530eda741c71d4c3, 0xbf25975319080000);
  q = T(0x4de2cab081, 0x14c34ab4676e4bab);
  b = T(0x1110001);
  r = T(0x3eb455);
  ASSERT_EQ(a, q * b + r);  // Sanity-check.

  T result_q, result_r;
  result_q = a / b;
  result_r = a % b;
  EXPECT_EQ(q, result_q);
  EXPECT_EQ(r, result_r);

  // Try the other way around.
  swap(q, b);
  result_q = a / b;
  result_r = a % b;
  EXPECT_EQ(q, result_q);
  EXPECT_EQ(r, result_r);
  // Restore.
  swap(b, q);

  // Dividend < divisor; result should be q:0 r:<dividend>.
  swap(a, b);
  result_q = a / b;
  result_r = a % b;
  EXPECT_EQ(0, result_q);
  EXPECT_EQ(a, result_r);
  // Try the other way around.
  swap(a, q);
  result_q = a / b;
  result_r = a % b;
  EXPECT_EQ(0, result_q);
  EXPECT_EQ(a, result_r);
  // Restore.
  swap(q, a);
  swap(b, a);

  // Try a large remainder.
  b = a / 2 + 1;
  T expected_r = T(0x29876d3a0e38ea61, 0xdf92cba98c83ffff);
  // Sanity checks.
  ASSERT_EQ(a / 2 - 1, expected_r);
  ASSERT_EQ(a, b + expected_r);
  result_q = a / b;
  result_r = a % b;
  EXPECT_EQ(1, result_q);
  EXPECT_EQ(expected_r, result_r);
}

template <typename T>
void TestRandomInputs() {
  const int kNumIters = 1 << 18;
  std::minstd_rand random(testing::UnitTest::GetInstance()->random_seed());
  std::uniform_int_distribution<uint64_t> uniform_uint64;
  for (int i = 0; i < kNumIters; ++i) {
    const T a = T(uniform_uint64(random), uniform_uint64(random));
    const T b = T(uniform_uint64(random), uniform_uint64(random));
    if (b == 0) {
      continue;  // Avoid a div-by-zero.
    }
    const T q = a / b;
    const T r = a % b;
    ASSERT_EQ(a, b * q + r);
  }
}

TEST(Uint128, DivideAndMod) {
  TestDivision<DivisionImpl>();
  TestDivision<LLVMDivision>();
  TestDivision<MyDivision1>();
  TestDivision<MyDivision2>();
  TestDivision<GmpDivision>();
  TestDivision<LibDivideDivision>();
}

TEST(Uint128, DivideAndModRandomInputs) {
  TestRandomInputs<DivisionImpl>();
  TestRandomInputs<LLVMDivision>();
  TestRandomInputs<MyDivision1>();
  TestRandomInputs<MyDivision2>();
  TestRandomInputs<GmpDivision>();
  TestRandomInputs<LibDivideDivision>();
}

constexpr size_t kSampleSize = 1000000;

std::mt19937 MakeRandomEngine() {
  std::random_device r;
  std::seed_seq seed({r(), r(), r(), r(), r(), r(), r(), r()});
  return std::mt19937(seed);
}

template <typename T, typename H = typename std::conditional<false, int64_t,
                                                             uint64_t>::type>
std::vector<std::pair<T, T>> GetRandomClass128SampleUniformDivisor() {
  std::vector<std::pair<T, T>> values;
  std::mt19937 random = MakeRandomEngine();
  std::uniform_int_distribution<H> uniform_h;
  values.reserve(kSampleSize);
  for (size_t i = 0; i < kSampleSize; ++i) {
    T a{absl::MakeUint128(uniform_h(random), uniform_h(random))};
    T b{absl::MakeUint128(uniform_h(random), uniform_h(random))};
    values.emplace_back(std::max(a, b), std::max(T(2), std::min(a, b)));
  }
  return values;
}

template <typename T>
void BM_DivideClass128UniformDivisor(benchmark::State& state) {
  auto values = GetRandomClass128SampleUniformDivisor<T>();
  while (state.KeepRunningBatch(values.size())) {
    for (const auto& pair : values) {
      benchmark::DoNotOptimize(pair.first / pair.second);
    }
  }
}

template <typename T>
void BM_RemainderClass128UniformDivisor(benchmark::State& state) {
  auto values = GetRandomClass128SampleUniformDivisor<T>();
  while (state.KeepRunningBatch(values.size())) {
    for (const auto& pair : values) {
      benchmark::DoNotOptimize(pair.first % pair.second);
    }
  }
}

template <typename T, typename H = typename std::conditional<false, int64_t,
                                                             uint64_t>::type>
std::vector<std::pair<T, H>> GetRandomClass128SampleSmallDivisor() {
  std::vector<std::pair<T, H>> values;
  std::mt19937 random = MakeRandomEngine();
  std::uniform_int_distribution<H> uniform_h;
  values.reserve(kSampleSize);
  for (size_t i = 0; i < kSampleSize; ++i) {
    T a{absl::MakeUint128(uniform_h(random), uniform_h(random))};
    H b{std::max(H{2}, uniform_h(random))};
    values.emplace_back(std::max(a, T(b)), b);
  }
  return values;
}

template <typename T>
void BM_DivideClass128SmallDivisor(benchmark::State& state) {
  auto values = GetRandomClass128SampleSmallDivisor<T>();
  while (state.KeepRunningBatch(values.size())) {
    for (const auto& pair : values) {
      benchmark::DoNotOptimize(pair.first / pair.second);
    }
  }
}

template <typename T>
void BM_RemainderClass128SmallDivisor(benchmark::State& state) {
  auto values = GetRandomClass128SampleSmallDivisor<T>();
  while (state.KeepRunningBatch(values.size())) {
    for (const auto& pair : values) {
      benchmark::DoNotOptimize(pair.first % pair.second);
    }
  }
}
#define BENCH_TYPE(X)                                        \
  BENCHMARK_TEMPLATE(BM_DivideClass128UniformDivisor, X);    \
  BENCHMARK_TEMPLATE(BM_RemainderClass128UniformDivisor, X); \
  BENCHMARK_TEMPLATE(BM_DivideClass128SmallDivisor, X);      \
  BENCHMARK_TEMPLATE(BM_RemainderClass128SmallDivisor, X)

BENCH_TYPE(absl::uint128);
BENCH_TYPE(LLVMDivision);
BENCH_TYPE(MyDivision1);
BENCH_TYPE(MyDivision2);
BENCH_TYPE(GmpDivision);
BENCH_TYPE(LibDivideDivision);

}
	// Installation.
	// Abseil, compile with -labseil
	// Libdivide as header.
	// GMP as header, compile with -lgmp
	// Change the private constructor of absl::uin128 to public in order to use public inheritance.
	// GMOCK, GUNIT, compile it.
	#include "absl/numeric/int128.h"

	#include <random>

	#include "libdivide.h"
	#include "benchmark/benchmark.h"
	#include "base/public/gmock.h"
	#include "base/public/gunit.h"
	#include "gmp.h"

	typedef unsigned tu_int __attribute__((mode(TI)));
	typedef int ti_int __attribute__((mode(TI)));
	typedef uint64_t du_int;
	typedef uint32_t su_int;

	typedef union {
	tu_int all;
	struct {
	du_int low;
	du_int high;
	} s;
	} utwords;

	tu_int LLVM_divmod(tu_int a, tu_int b, tu_int* rem) {
	const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
	const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
	utwords n;
	n.all = a;
	utwords d;
	d.all = b;
	utwords q;
	utwords r;
	unsigned sr;
	// special cases, X is unknown, K != 0
	if (n.s.high == 0) {
	if (d.s.high == 0) {
	// 0 X
	// ---
	// 0 X
	if (rem) *rem = n.s.low % d.s.low;
	return n.s.low / d.s.low;
	}
	// 0 X
	// ---
	// K X
	if (rem) *rem = n.s.low;
	return 0;
	}
	// n.s.high != 0
	if (d.s.low == 0) {
	if (d.s.high == 0) {
	// K X
	// ---
	// 0 0
	if (rem) *rem = n.s.high % d.s.low;
	return n.s.high / d.s.low;
	}
	// d.s.high != 0
	if (n.s.low == 0) {
	// K 0
	// ---
	// K 0
	if (rem) {
	r.s.high = n.s.high % d.s.high;
	r.s.low = 0;
	*rem = r.all;
	}
	return n.s.high / d.s.high;
	}
	// K K
	// ---
	// K 0
	if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ {
	if (rem) {
	r.s.low = n.s.low;
	r.s.high = n.s.high & (d.s.high - 1);
	*rem = r.all;
	}
	return n.s.high >> __builtin_ctzll(d.s.high);
	}
	// K K
	// ---
	// K 0
	sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
	// 0 <= sr <= n_udword_bits - 2 or sr large
	if (sr > n_udword_bits - 2) {
	if (rem) *rem = n.all;
	return 0;
	}
	++sr;
	// 1 <= sr <= n_udword_bits - 1
	// q.all = n.all << (n_utword_bits - sr);
	q.s.low = 0;
	q.s.high = n.s.low << (n_udword_bits - sr);
	// r.all = n.all >> sr;
	r.s.high = n.s.high >> sr;
	r.s.low = (n.s.high << (n_udword_bits - sr)) \| (n.s.low >> sr);
	} else /* d.s.low != 0 */ {
	if (d.s.high == 0) {
	// K X
	// ---
	// 0 K
	if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ {
	if (rem) *rem = n.s.low & (d.s.low - 1);
	if (d.s.low == 1) return n.all;
	sr = __builtin_ctzll(d.s.low);
	q.s.high = n.s.high >> sr;
	q.s.low = (n.s.high << (n_udword_bits - sr)) \| (n.s.low >> sr);
	return q.all;
	}
	// K X
	// ---
	// 0 K
	sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) -
	__builtin_clzll(n.s.high);
	// 2 <= sr <= n_utword_bits - 1
	// q.all = n.all << (n_utword_bits - sr);
	// r.all = n.all >> sr;
	if (sr == n_udword_bits) {
	q.s.low = 0;
	q.s.high = n.s.low;
	r.s.high = 0;
	r.s.low = n.s.high;
	} else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ {
	q.s.low = 0;
	q.s.high = n.s.low << (n_udword_bits - sr);
	r.s.high = n.s.high >> sr;
	r.s.low = (n.s.high << (n_udword_bits - sr)) \| (n.s.low >> sr);
	} else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ {
	q.s.low = n.s.low << (n_utword_bits - sr);
	q.s.high = (n.s.high << (n_utword_bits - sr)) \|
	(n.s.low >> (sr - n_udword_bits));
	r.s.high = 0;
	r.s.low = n.s.high >> (sr - n_udword_bits);
	}
	} else {
	// K X
	// ---
	// K K
	sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high);
	// 0 <= sr <= n_udword_bits - 1 or sr large
	if (sr > n_udword_bits - 1) {
	if (rem) *rem = n.all;
	return 0;
	}
	++sr;
	// 1 <= sr <= n_udword_bits
	// q.all = n.all << (n_utword_bits - sr);
	// r.all = n.all >> sr;
	q.s.low = 0;
	if (sr == n_udword_bits) {
	q.s.high = n.s.low;
	r.s.high = 0;
	r.s.low = n.s.high;
	} else {
	r.s.high = n.s.high >> sr;
	r.s.low = (n.s.high << (n_udword_bits - sr)) \| (n.s.low >> sr);
	q.s.high = n.s.low << (n_udword_bits - sr);
	}
	}
	}
	// Not a special case
	// q and r are initialized with:
	// q.all = n.all << (n_utword_bits - sr);
	// r.all = n.all >> sr;
	// 1 <= sr <= n_utword_bits - 1
	su_int carry = 0;
	for (; sr > 0; --sr) {
	// r:q = ((r:q) << 1) \| carry
	r.s.high = (r.s.high << 1) \| (r.s.low >> (n_udword_bits - 1));
	r.s.low = (r.s.low << 1) \| (q.s.high >> (n_udword_bits - 1));
	q.s.high = (q.s.high << 1) \| (q.s.low >> (n_udword_bits - 1));
	q.s.low = (q.s.low << 1) \| carry;
	// carry = 0;
	// if (r.all >= d.all)
	// {
	// r.all -= d.all;
	// carry = 1;
	// }
	const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1);
	carry = s & 1;
	r.all -= d.all & s;
	}
	q.all = (q.all << 1) \| carry;
	if (rem) *rem = r.all;
	return q.all;
	}

	inline du_int Divide128div64to64Primitive(du_int u1, du_int u0, du_int v,
	du_int* r) {
	// Code taken from Hacker's Delight:
	// http://www.hackersdelight.org/HDcode/divlu.c.
	// License permits inclusion here per:
	// http://www.hackersdelight.org/permissions.htm

	const uint64_t b = (1ULL << 32); // Number base (32 bits)
	uint64_t un1, un0; // Norm. dividend LSD's
	uint64_t vn1, vn0; // Norm. divisor digits
	uint64_t q1, q0; // Quotient digits
	uint64_t un64, un21, un10; // Dividend digit pairs
	uint64_t rhat; // A remainder
	int32_t s; // Shift amount for norm

	// If overflow, set rem. to an impossible value,
	// and return the largest possible quotient
	if (u1 >= v) {
	*r = (uint64_t)-1;
	return (uint64_t)-1;
	}

	// count leading zeros
	s = __builtin_clzll(v);
	if (s > 0) {
	// Normalize divisor
	v = v << s;
	un64 = (u1 << s) \| (u0 >> (64 - s));
	un10 = u0 << s; // Shift dividend left
	} else {
	// Avoid undefined behavior of (u0 >> 64).
	// The behavior is undefined if the right operand is
	// negative, or greater than or equal to the length
	// in bits of the promoted left operand.
	un64 = u1;
	un10 = u0;
	}

	// Break divisor up into two 32-bit digits
	vn1 = v >> 32;
	vn0 = v & 0xFFFFFFFF;

	// Break right half of dividend into two digits
	un1 = un10 >> 32;
	un0 = un10 & 0xFFFFFFFF;

	// Compute the first quotient digit, q1
	q1 = un64 / vn1;
	rhat = un64 - q1 * vn1;

	while (q1 >= b \|\| q1 * vn0 > b * rhat + un1) {
	q1 = q1 - 1;
	rhat = rhat + vn1;
	if (rhat >= b) break;
	}

	// Multiply and subtract
	un21 = un64 * b + un1 - q1 * v;

	// Compute the second quotient digit
	q0 = un21 / vn1;
	rhat = un21 - q0 * vn1;

	while (q0 >= b \|\| q0 * vn0 > b * rhat + un0) {
	q0 = q0 - 1;
	rhat = rhat + vn1;
	if (rhat >= b) break;
	}

	r = (un21 b + un0 - q0 * v) >> s;
	return q1 * b + q0;
	}

	inline du_int Divide128div64to64(du_int u1, du_int u0, du_int v, du_int* r) {
	#if defined(__x86_64__)
	du_int result;
	__asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
	return result;
	#else
	return Divide128div64to64Primitive(u1, u0, v, r);
	#endif
	}

	struct OptimizedDiv {
	du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
	return Divide128div64to64(u1, u0, v, r);
	}
	};

	struct PrimitiveDiv {
	du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
	return Divide128div64to64Primitive(u1, u0, v, r);
	}
	};

	struct GmpDiv {
	du_int operator()(du_int u1, du_int u0, du_int v, du_int* r) const {
	mp_limb_t q[2] = {u0, u1};
	mp_limb_t result[2] = {0, 0};
	*r = mpn_divrem_1(result, 0, q, 2, v);
	return result[0];
	}
	};

	template <typename Div128by64>
	tu_int MyDivMod1(tu_int a, tu_int b, tu_int* rem) {
	const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
	(void)n_utword_bits;
	utwords dividend;
	dividend.all = a;
	utwords divisor;
	divisor.all = b;
	utwords quotient;
	utwords remainder;
	if (divisor.s.high == 0) {
	remainder.s.high = 0;
	if (dividend.s.high < divisor.s.low) {
	quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
	divisor.s.low, &remainder.s.low);
	quotient.s.high = 0;
	} else {
	quotient.s.high =
	Div128by64{}(0, dividend.s.high, divisor.s.low, &dividend.s.high);
	quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
	divisor.s.low, &remainder.s.low);
	}
	if (rem) *rem = remainder.all;
	return quotient.all;
	}
	int shift =
	__builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
	divisor.all <<= shift;
	quotient.s.high = 0;
	quotient.s.low = 0;
	for (; shift >= 0; --shift) {
	quotient.s.low <<= 1;
	// const ti_int s = (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
	// quotient.s.low \|= s & 1;
	if (dividend.all >= divisor.all) {
	dividend.all -= divisor.all;
	quotient.s.low \|= 1;
	}
	// dividend.all -= divisor.all & s;
	divisor.all >>= 1;
	}
	if (rem) *rem = dividend.all;
	return quotient.all;
	}

	template <typename Div128by64>
	tu_int MyDivMod2(tu_int a, tu_int b, tu_int* rem) {
	utwords dividend;
	dividend.all = a;
	utwords divisor;
	divisor.all = b;
	utwords quotient;
	utwords remainder;
	if (divisor.all > dividend.all) {
	if (rem) *rem = dividend.all;
	return 0;
	}
	if (divisor.s.high == 0) {
	remainder.s.high = 0;
	if (dividend.s.high < divisor.s.low) {
	quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
	divisor.s.low, &remainder.s.low);
	quotient.s.high = 0;
	} else {
	quotient.s.high =
	Div128by64{}(0, dividend.s.high, divisor.s.low, &dividend.s.high);
	quotient.s.low = Div128by64{}(dividend.s.high, dividend.s.low,
	divisor.s.low, &remainder.s.low);
	}
	if (rem) *rem = remainder.all;
	return quotient.all;
	}
	quotient.s.high = 0;
	quotient.s.low = libdivide::libdivide_128_div_128_to_64(
	dividend.s.high, dividend.s.low, divisor.s.high, divisor.s.low,
	&remainder.s.high, &remainder.s.low);
	if (rem) {
	*rem = remainder.all;
	}
	return quotient.all;
	}

	struct DivisionImpl : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	DivisionImpl(absl::uint128 u) : Base(u) {}
	};

	struct LLVMDivision : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	LLVMDivision(absl::uint128 u) : Base(u) {}
	};

	absl::uint128 operator/(const LLVMDivision& lhs, const LLVMDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords n;
	n.all = LLVM_divmod(lhs_u.all, rhs_u.all, nullptr);
	return absl::MakeUint128(n.s.high, n.s.low);
	}

	absl::uint128 operator%(const LLVMDivision& lhs, const LLVMDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords rem;
	LLVM_divmod(lhs_u.all, rhs_u.all, &rem.all);
	return absl::MakeUint128(rem.s.high, rem.s.low);
	}

	struct MyDivision1 : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	MyDivision1(absl::uint128 u) : Base(u) {}
	};

	absl::uint128 operator/(const MyDivision1& lhs, const MyDivision1& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords n;
	n.all = MyDivMod1<OptimizedDiv>(lhs_u.all, rhs_u.all, nullptr);
	return absl::MakeUint128(n.s.high, n.s.low);
	}

	absl::uint128 operator%(const MyDivision1& lhs, const MyDivision1& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords rem;
	MyDivMod1<OptimizedDiv>(lhs_u.all, rhs_u.all, &rem.all);
	return absl::MakeUint128(rem.s.high, rem.s.low);
	}

	struct MyDivision2 : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	MyDivision2(absl::uint128 u) : Base(u) {}
	};

	absl::uint128 operator/(const MyDivision2& lhs, const MyDivision2& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords n;
	n.all = MyDivMod1<PrimitiveDiv>(lhs_u.all, rhs_u.all, nullptr);
	return absl::MakeUint128(n.s.high, n.s.low);
	}

	absl::uint128 operator%(const MyDivision2& lhs, const MyDivision2& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords rem;
	MyDivMod1<PrimitiveDiv>(lhs_u.all, rhs_u.all, &rem.all);
	return absl::MakeUint128(rem.s.high, rem.s.low);
	}

	struct GmpDivision : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	GmpDivision(absl::uint128 u) : Base(u) {}
	};

	absl::uint128 operator/(const GmpDivision& lhs, const GmpDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords n;
	n.all = MyDivMod1<GmpDiv>(lhs_u.all, rhs_u.all, nullptr);
	return absl::MakeUint128(n.s.high, n.s.low);
	}

	absl::uint128 operator%(const GmpDivision& lhs, const GmpDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords rem;
	MyDivMod1<GmpDiv>(lhs_u.all, rhs_u.all, &rem.all);
	return absl::MakeUint128(rem.s.high, rem.s.low);
	}


	struct LibDivideDivision : public absl::uint128 {
	using Base = absl::uint128;
	using Base::Base;
	LibDivideDivision(absl::uint128 u) : Base(u) {}
	};

	absl::uint128 operator/(const LibDivideDivision& lhs, const LibDivideDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords n;
	n.all = MyDivMod2<OptimizedDiv>(lhs_u.all, rhs_u.all, nullptr);
	return absl::MakeUint128(n.s.high, n.s.low);
	}

	absl::uint128 operator%(const LibDivideDivision& lhs, const LibDivideDivision& rhs) {
	utwords lhs_u;
	utwords rhs_u;
	lhs_u.s.low = absl::Uint128Low64(lhs);
	lhs_u.s.high = absl::Uint128High64(lhs);
	rhs_u.s.low = absl::Uint128Low64(rhs);
	rhs_u.s.high = absl::Uint128High64(rhs);
	utwords rem;
	MyDivMod2<OptimizedDiv>(lhs_u.all, rhs_u.all, &rem.all);
	return absl::MakeUint128(rem.s.high, rem.s.low);
	}

	namespace {

	template <typename T>
	void TestDivision() {
	using std::swap;
	// a := q * b + r
	T a, b, q, r;

	// Zero test.
	a = 0;
	b = 123;
	q = a / b;
	r = a % b;
	EXPECT_EQ(0, q);
	EXPECT_EQ(0, r);

	a = T(0x530eda741c71d4c3, 0xbf25975319080000);
	q = T(0x4de2cab081, 0x14c34ab4676e4bab);
	b = T(0x1110001);
	r = T(0x3eb455);
	ASSERT_EQ(a, q * b + r); // Sanity-check.

	T result_q, result_r;
	result_q = a / b;
	result_r = a % b;
	EXPECT_EQ(q, result_q);
	EXPECT_EQ(r, result_r);

	// Try the other way around.
	swap(q, b);
	result_q = a / b;
	result_r = a % b;
	EXPECT_EQ(q, result_q);
	EXPECT_EQ(r, result_r);
	// Restore.
	swap(b, q);

	// Dividend < divisor; result should be q:0 r:<dividend>.
	swap(a, b);
	result_q = a / b;
	result_r = a % b;
	EXPECT_EQ(0, result_q);
	EXPECT_EQ(a, result_r);
	// Try the other way around.
	swap(a, q);
	result_q = a / b;
	result_r = a % b;
	EXPECT_EQ(0, result_q);
	EXPECT_EQ(a, result_r);
	// Restore.
	swap(q, a);
	swap(b, a);

	// Try a large remainder.
	b = a / 2 + 1;
	T expected_r = T(0x29876d3a0e38ea61, 0xdf92cba98c83ffff);
	// Sanity checks.
	ASSERT_EQ(a / 2 - 1, expected_r);
	ASSERT_EQ(a, b + expected_r);
	result_q = a / b;
	result_r = a % b;
	EXPECT_EQ(1, result_q);
	EXPECT_EQ(expected_r, result_r);
	}

	template <typename T>
	void TestRandomInputs() {
	const int kNumIters = 1 << 18;
	std::minstd_rand random(testing::UnitTest::GetInstance()->random_seed());
	std::uniform_int_distribution<uint64_t> uniform_uint64;
	for (int i = 0; i < kNumIters; ++i) {
	const T a = T(uniform_uint64(random), uniform_uint64(random));
	const T b = T(uniform_uint64(random), uniform_uint64(random));
	if (b == 0) {
	continue; // Avoid a div-by-zero.
	}
	const T q = a / b;
	const T r = a % b;
	ASSERT_EQ(a, b * q + r);
	}
	}

	TEST(Uint128, DivideAndMod) {
	TestDivision<DivisionImpl>();
	TestDivision<LLVMDivision>();
	TestDivision<MyDivision1>();
	TestDivision<MyDivision2>();
	TestDivision<GmpDivision>();
	TestDivision<LibDivideDivision>();
	}

	TEST(Uint128, DivideAndModRandomInputs) {
	TestRandomInputs<DivisionImpl>();
	TestRandomInputs<LLVMDivision>();
	TestRandomInputs<MyDivision1>();
	TestRandomInputs<MyDivision2>();
	TestRandomInputs<GmpDivision>();
	TestRandomInputs<LibDivideDivision>();
	}

	constexpr size_t kSampleSize = 1000000;

	std::mt19937 MakeRandomEngine() {
	std::random_device r;
	std::seed_seq seed({r(), r(), r(), r(), r(), r(), r(), r()});
	return std::mt19937(seed);
	}

	template <typename T, typename H = typename std::conditional<false, int64_t,
	uint64_t>::type>
	std::vector<std::pair<T, T>> GetRandomClass128SampleUniformDivisor() {
	std::vector<std::pair<T, T>> values;
	std::mt19937 random = MakeRandomEngine();
	std::uniform_int_distribution<H> uniform_h;
	values.reserve(kSampleSize);
	for (size_t i = 0; i < kSampleSize; ++i) {
	T a{absl::MakeUint128(uniform_h(random), uniform_h(random))};
	T b{absl::MakeUint128(uniform_h(random), uniform_h(random))};
	values.emplace_back(std::max(a, b), std::max(T(2), std::min(a, b)));
	}
	return values;
	}

	template <typename T>
	void BM_DivideClass128UniformDivisor(benchmark::State& state) {
	auto values = GetRandomClass128SampleUniformDivisor<T>();
	while (state.KeepRunningBatch(values.size())) {
	for (const auto& pair : values) {
	benchmark::DoNotOptimize(pair.first / pair.second);
	}
	}
	}

	template <typename T>
	void BM_RemainderClass128UniformDivisor(benchmark::State& state) {
	auto values = GetRandomClass128SampleUniformDivisor<T>();
	while (state.KeepRunningBatch(values.size())) {
	for (const auto& pair : values) {
	benchmark::DoNotOptimize(pair.first % pair.second);
	}
	}
	}

	template <typename T, typename H = typename std::conditional<false, int64_t,
	uint64_t>::type>
	std::vector<std::pair<T, H>> GetRandomClass128SampleSmallDivisor() {
	std::vector<std::pair<T, H>> values;
	std::mt19937 random = MakeRandomEngine();
	std::uniform_int_distribution<H> uniform_h;
	values.reserve(kSampleSize);
	for (size_t i = 0; i < kSampleSize; ++i) {
	T a{absl::MakeUint128(uniform_h(random), uniform_h(random))};
	H b{std::max(H{2}, uniform_h(random))};
	values.emplace_back(std::max(a, T(b)), b);
	}
	return values;
	}

	template <typename T>
	void BM_DivideClass128SmallDivisor(benchmark::State& state) {
	auto values = GetRandomClass128SampleSmallDivisor<T>();
	while (state.KeepRunningBatch(values.size())) {
	for (const auto& pair : values) {
	benchmark::DoNotOptimize(pair.first / pair.second);
	}
	}
	}

	template <typename T>
	void BM_RemainderClass128SmallDivisor(benchmark::State& state) {
	auto values = GetRandomClass128SampleSmallDivisor<T>();
	while (state.KeepRunningBatch(values.size())) {
	for (const auto& pair : values) {
	benchmark::DoNotOptimize(pair.first % pair.second);
	}
	}
	}
	#define BENCH_TYPE(X) \
	BENCHMARK_TEMPLATE(BM_DivideClass128UniformDivisor, X); \
	BENCHMARK_TEMPLATE(BM_RemainderClass128UniformDivisor, X); \
	BENCHMARK_TEMPLATE(BM_DivideClass128SmallDivisor, X); \
	BENCHMARK_TEMPLATE(BM_RemainderClass128SmallDivisor, X)

	BENCH_TYPE(absl::uint128);
	BENCH_TYPE(LLVMDivision);
	BENCH_TYPE(MyDivision1);
	BENCH_TYPE(MyDivision2);
	BENCH_TYPE(GmpDivision);
	BENCH_TYPE(LibDivideDivision);

	}