davedice/MonoTime.cc

## MonoTime.cc
// Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso
//
// Dave Dice -- blogs.oracle.com/dave

#include <thread>
#include <chrono>
#include <iostream>
#include <vector>
#include <mutex>
#include <random>
#include <atomic>
#include <alloca.h>
#include <cxxabi.h>

#include <sys/time.h>

template<typename Fun>
inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) {
  std::lock_guard<std::mutex> Locker(L) ;
  return fn() ;
}

// Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers
#define CALIGNED __attribute__ ((aligned(128)))

struct _Aligner {
  CALIGNED int a [0] ;
} ;

template<typename T> using Ref = T * ;
template<typename T> using AlignedType CALIGNED = T ;
using Av32t = CALIGNED volatile int32_t ;

static const auto LMax = [](auto x, auto y) { return (x >= y) ? x : y ; } ;
static const auto LMin = [](auto x, auto y) { return (x <= y) ? x : y ; } ;

template<typename T>
T CAS(std::atomic<T> * p, T Comparand, T Set) {
  T cpy = Comparand ;
  std::atomic_compare_exchange_strong (p, &cpy, Set) ;
  return cpy ;
}

template<typename T>
T CASV(std::atomic<T> * p, T Comparand, T Set) {
  std::atomic_compare_exchange_strong (p, &Comparand, Set) ;
  return Comparand ;
}

static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
  std::atomic_compare_exchange_strong (ARef, &Cmp, Set) ;
  return Cmp ;
} ;

static void FullFence() {
  std::atomic_thread_fence (std::memory_order_seq_cst) ;
}

static void CompilerFence() {
  // Sequence point
  std::atomic_thread_fence (std::memory_order_relaxed) ;
}


// MonoTime() : provides a non-retrograde causal clock
// Models HotSpot JVM nanoTime() implementation
//
// Remarks:
// *  Note that the fetch of TMax is SC by default : memory_order_seq_cst.
//    A weak load might suffice as the value is validated via CAS
//    and CAS has strong fence semantics.
// *  We use an anonymous union to sequester and isolate TMax as sole occupant
//    of its cache line.
// *  MonoTime() uses an abstract base, so adding a small episilon is benign
//    That is, the implement has latitute to add "Grain".
//    MonoTime() is uncorrelated with other clocks - not comparable
//    It should be used only for relative time, not wall-clock absolute time.
// *  Note that the grain of the underlying gethrtime() clock source
//    already influences the peak update rate.
// *  A useful microoptimization is to LD Grain before the LD-CAS window
// *  Being able to set Grain gives us a useful test to see if an application
//    running on a system is sensitive to the nanoTime coherence hotspot.
// *  Requires only a one-line changeset relative to baseline : add Grain
// *  Assumes the skew-drift is extremely small
// *  This example uses gethrtime() on Solaris
//    We could also use "RD STICK" directly.
//    Substitute your favor clock source accordingly.
//    On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice,
//    assuming it's implemented via VDSO.
//    You might also use RDTSCP directly.
// *  MonoTime() is expected to be causal in the following sense.
//    Say thread A calls MonoTime() which returns value V.
//    A stores V into some memory location.
//    Thread B fetches from that location and observes the store, fetching V.
//    B then call MonoTime() which returns W
//    It should be the case that W >= V.

static hrtime_t Grain = 0 ;

static hrtime_t MonoTime() {
  static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ;
  hrtime_t now = gethrtime() ;
  const hrtime_t tmax = TMax ;
  if (tmax >= now) return tmax ;
  // Quantize "now" value up to reduce subsequent update rate
  // Attempt to reduce write rate into TMax to avoid coherence hotspot
  // Grain value reflects trade-off between update rate and granularity of MonoTime() values
  now += Grain ;
  const hrtime_t v = LCAS (&TMax, tmax, now) ;
  // Note that we can trivially make the following branch-free, but it's less readable.
  if (v == tmax) return now ;
  // claim : v > tmax
  return v ;
}

int main (int argc, char * argv []) {
  setbuf (stdout, NULL) ;
  auto nthreads = atoi(argv[1]);
  Grain = atoi (argv[2]) ;
  printf ("%d threads on %d CPUs; Grain=%lld\n",
    nthreads,
    std::thread::hardware_concurrency(),
    Grain) ;

  double Duration = 10.0;

  // beware of false sharing ...
  std::mutex CALIGNED mut {};
  int64_t CALIGNED tally   = 0 ;
  volatile bool CALIGNED done = false;

  std::vector<std::thread> threads;
  for (int i = 0; i < nthreads; i++) {
    threads.emplace_back(std::thread([&](){
      int64_t steps = 0 ;
      hrtime_t sum = 0 ;
      hrtime_t prv = MonoTime() ;
      while (!done) {
        ++steps ;
        const auto v = MonoTime() ;
        if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ;
        prv = v ;
        sum += v ;
      }
      if (sum == 0x11) printf ("!") ;
      mut + [&] { tally += steps; } ;
      printf ("%d ", steps) ;
    }));
  }

  std::this_thread::sleep_for(std::chrono::duration<double>(Duration));
  done = true;

  for (auto &t : threads) {
    t.join();
  }

  printf ("\n") ;
  std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n";
  printf ("\n") ;

  return 0;
}
	// Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso
	//
	// Dave Dice -- blogs.oracle.com/dave

	#include <thread>
	#include <chrono>
	#include <iostream>
	#include <vector>
	#include <mutex>
	#include <random>
	#include <atomic>
	#include <alloca.h>
	#include <cxxabi.h>

	#include <sys/time.h>

	template<typename Fun>
	inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) {
	std::lock_guard<std::mutex> Locker(L) ;
	return fn() ;
	}

	// Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers
	#define CALIGNED __attribute__ ((aligned(128)))

	struct _Aligner {
	CALIGNED int a [0] ;
	} ;

	template<typename T> using Ref = T * ;
	template<typename T> using AlignedType CALIGNED = T ;
	using Av32t = CALIGNED volatile int32_t ;

	static const auto LMax = [](auto x, auto y) { return (x >= y) ? x : y ; } ;
	static const auto LMin = [](auto x, auto y) { return (x <= y) ? x : y ; } ;

	template<typename T>
	T CAS(std::atomic<T> * p, T Comparand, T Set) {
	T cpy = Comparand ;
	std::atomic_compare_exchange_strong (p, &cpy, Set) ;
	return cpy ;
	}

	template<typename T>
	T CASV(std::atomic<T> * p, T Comparand, T Set) {
	std::atomic_compare_exchange_strong (p, &Comparand, Set) ;
	return Comparand ;
	}

	static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
	std::atomic_compare_exchange_strong (ARef, &Cmp, Set) ;
	return Cmp ;
	} ;

	static void FullFence() {
	std::atomic_thread_fence (std::memory_order_seq_cst) ;
	}

	static void CompilerFence() {
	// Sequence point
	std::atomic_thread_fence (std::memory_order_relaxed) ;
	}


	// MonoTime() : provides a non-retrograde causal clock
	// Models HotSpot JVM nanoTime() implementation
	//
	// Remarks:
	// * Note that the fetch of TMax is SC by default : memory_order_seq_cst.
	// A weak load might suffice as the value is validated via CAS
	// and CAS has strong fence semantics.
	// * We use an anonymous union to sequester and isolate TMax as sole occupant
	// of its cache line.
	// * MonoTime() uses an abstract base, so adding a small episilon is benign
	// That is, the implement has latitute to add "Grain".
	// MonoTime() is uncorrelated with other clocks - not comparable
	// It should be used only for relative time, not wall-clock absolute time.
	// * Note that the grain of the underlying gethrtime() clock source
	// already influences the peak update rate.
	// * A useful microoptimization is to LD Grain before the LD-CAS window
	// * Being able to set Grain gives us a useful test to see if an application
	// running on a system is sensitive to the nanoTime coherence hotspot.
	// * Requires only a one-line changeset relative to baseline : add Grain
	// * Assumes the skew-drift is extremely small
	// * This example uses gethrtime() on Solaris
	// We could also use "RD STICK" directly.
	// Substitute your favor clock source accordingly.
	// On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice,
	// assuming it's implemented via VDSO.
	// You might also use RDTSCP directly.
	// * MonoTime() is expected to be causal in the following sense.
	// Say thread A calls MonoTime() which returns value V.
	// A stores V into some memory location.
	// Thread B fetches from that location and observes the store, fetching V.
	// B then call MonoTime() which returns W
	// It should be the case that W >= V.

	static hrtime_t Grain = 0 ;

	static hrtime_t MonoTime() {
	static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ;
	hrtime_t now = gethrtime() ;
	const hrtime_t tmax = TMax ;
	if (tmax >= now) return tmax ;
	// Quantize "now" value up to reduce subsequent update rate
	// Attempt to reduce write rate into TMax to avoid coherence hotspot
	// Grain value reflects trade-off between update rate and granularity of MonoTime() values
	now += Grain ;
	const hrtime_t v = LCAS (&TMax, tmax, now) ;
	// Note that we can trivially make the following branch-free, but it's less readable.
	if (v == tmax) return now ;
	// claim : v > tmax
	return v ;
	}

	int main (int argc, char * argv []) {
	setbuf (stdout, NULL) ;
	auto nthreads = atoi(argv[1]);
	Grain = atoi (argv[2]) ;
	printf ("%d threads on %d CPUs; Grain=%lld\n",
	nthreads,
	std::thread::hardware_concurrency(),
	Grain) ;

	double Duration = 10.0;

	// beware of false sharing ...
	std::mutex CALIGNED mut {};
	int64_t CALIGNED tally = 0 ;
	volatile bool CALIGNED done = false;

	std::vector<std::thread> threads;
	for (int i = 0; i < nthreads; i++) {
	threads.emplace_back(std::thread([&](){
	int64_t steps = 0 ;
	hrtime_t sum = 0 ;
	hrtime_t prv = MonoTime() ;
	while (!done) {
	++steps ;
	const auto v = MonoTime() ;
	if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ;
	prv = v ;
	sum += v ;
	}
	if (sum == 0x11) printf ("!") ;
	mut + [&] { tally += steps; } ;
	printf ("%d ", steps) ;
	}));
	}

	std::this_thread::sleep_for(std::chrono::duration<double>(Duration));
	done = true;

	for (auto &t : threads) {
	t.join();
	}

	printf ("\n") ;
	std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n";
	printf ("\n") ;

	return 0;
	}