Inflatable SeqLock
// InflatableSeqLock
// Copyright(C) 2016 Oracle and/or its affiliates
// Dave Dice : https://blogs.oracle.com/dave
//
// Remarks:
// * Implements a composite writer mutex and seqlock in a single word-sized lock.
// Allows optimistic reading and pessimistic writing.
// Readers don't write to shared synchronization metadata, which helps avoid
// coherence traffic. Readers must tolerate observing inconsistent state, however.
// * Writer mutex is based on LIFO-CR lock from http://arxiv.org/abs/1511.06035.
// * If 2 words were available then we'd simply use a (SeqLock,WriterLock) tuple.
// The WriterLock could be a classic LIFO-CR lock.
// We'd update the SeqLock least significant bit to reflect the WriterLock state.
// This can be accomplished with normal non-atomic accesses.
// Additional fencing is required for those non-atomic SeqLock updates.
// * Does not support reentrant (nested) locking.
// The identity of the lock owner is anonymous.
// * Optimistic readers that fail repeatedly on the same episode are
// expected to fall back to pessimistic write locking to ensure progress.
// * Because we're optimizing for footprint and density, a single-word lock is
// more vulnerable to false sharing under contended or promiscuous use cases.
// * LIFO-CR properties :
// + single-word lock
// + succession by direct handoff
// + aggressive self-deflating locks -- prompt deflation
// + Uses Spin-then-park polite waiting ; no unbounded spinning
// + provides local spinning in spin-then-park waiting
// + Requires CAS in both lock and unlock fast paths
// + Explicit lock-free stack of waiters
// The stacks are immune to the ABA problem as access is single-consumer multiple-producer.
// The single-consumer safety property derives from the lock itself:
// only the lock owner can pop.
// + When writer threads are waiting, the upper bits of the SeqLock field point to
// the head (top) of a stack of waiting threads.
// Waiting threads are linked together via on-stack "PushNode" elements.
// + uses Bernoulli trials for long-term fairness
// Occasionally select the tail of the stack for succession.
// Unfair over the short-term but enforces long-term fairness.
// A classic seqlock that uses the LSB as a test-and-set lock can exhibit
// sustained long-term unfairness and starvation.
// + As it happens, the current implementation is
// "thread oblivious" in the sense that thread T1 might acquire the lock
// and thread T2 could subsequently release the lock.
// * A viable alternative to the LIFO-CR lock is the following:
// Yoshihiro Oyama, Kenjiro Taura, Akinori Yonezawa
// Executing Parallel Programs with Synchronization Bottlenecks Efficiently
// PDSIA 1999
// * If necessary, a writer can access the displaced inflatable seqlock word
// by traversing the lock stack. The displaced word is the terminal element.
// * Lock word encoding = (Variant : CONTENDED : LOCKED)
// Normal : V:0:0 V is Version
// Locked : V:0:1 Writer is active
// Contended : P:1:1 P is pointer to PushNode element
// Illegal : *:1:0 CONTENDED set without LOCKED never occurs
// CONTENDED=1 implies LOCKED=1
// LOCKED=0 implies CONTENDED=0
// CONTENDED=1 implies remainder contains pointer to PushNode
// CONTENDED=0 implies remainder contains version
// * The implementation is not strictly compliant C++ given the casts
// and bit operations on pointers.
// We assume the low order 2 bits of pointers are available for our use,
// which in turn presumes alignment of the referents.
// * Invariant:
// The displaced version number resides at the tail of the stack -- (V:0:1)
// * We assume a 64-bit environment with 62 bits for the seqlock.
// Roll-over is not a practical concern.
// * The implementation may batch together groups of writes under one seqlock "increment"
// episode.
//
// Remarks specific to the implementation sketch:
// * For the purposes of explication, we've implemented waiting via unbounded spinning.
// It is trivial, however, to convert the code to use park-unpark facilities.
// * The implementation is conservatively over-fenced with std::memory_order_seq_cst.
// Using more efficient and relaxed fencing is left as an exercise for the reader.
// * Potential thrown exceptions in the lambdas are ignored.
// * Various bits of support infrastructure are assumed.
// * To make the code a bit shorter I've used gotos.
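// --- Assumed support infrastructure : illustrative sketch only ---
// The gist assumes ASSERT(), Pause(), Bernoulli() and Underlying() exist and
// omits headers. The definitions below are plausible stand-ins, not part of
// the original, provided only so the sketch is self-contained.
#include <atomic>
#include <cstdint>
#include <cassert>
#include <functional>
#include <random>
#include <thread>

#define ASSERT(x) assert(x)

// Polite spin-wait hint; a dedicated CPU pause instruction would be preferable.
static inline void Pause () { std::this_thread::yield() ; }

// Bernoulli trial : returns true with probability p
static inline bool Bernoulli (double p) {
  static thread_local std::mt19937_64 rng { std::random_device{}() } ;
  return std::uniform_real_distribution<double>(0.0, 1.0)(rng) < p ;
}

// Underlying : extract the raw word held in an atomic
static inline uintptr_t Underlying (const std::atomic<uintptr_t> & a) { return a.load() ; }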
static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
std::atomic_compare_exchange_strong (ARef, &Cmp, Set) ;
return Cmp ;
} ;
class InflatableSeqLock {
private:
// ISeqLock tag and VERSION(1) encoding ...
enum { LOCKED=1, CONTENDED=2, TAGMASK=3, VERSION1=4, } ;
std::atomic<uintptr_t> ISeqLock {VERSION1} ;
// Policies for succession ...
static const enum {LIBERAL, STRICT} SuccessionMode {LIBERAL} ;
static constexpr double TailProbability = 0.0001 ;
static_assert(sizeof(uintptr_t) == 8, "invariant") ;
struct PushNode {
alignas(128) std::atomic<uintptr_t> Next {0} ;
std::atomic<int> Admit {0} ;
} ;
// Pass lock ownership to thread referenced by "t"
// Unpark(t) as necessary
void Resume (PushNode * t) {
// FENCE: LD-ST | ST
ASSERT (t->Admit == 0) ;
t->Admit = 1 ;
}
public:
// Standard RAII wrapper
class Guard {
private:
InflatableSeqLock & I ;
public:
Guard (InflatableSeqLock & s) : I(s) { I.Lock() ; }
Guard (InflatableSeqLock * s) : I(*s) { I.Lock() ; }
~Guard() { I.Unlock() ; }
Guard (const Guard &) = delete ;
Guard & operator=(const Guard &) = delete ;
} ;
uintptr_t ReadVersion() { return ISeqLock ;}
int IsLocked (uintptr_t v) { return (v & LOCKED) != 0; }
int Validate (uintptr_t v) { return ((v & LOCKED) | (v ^ ISeqLock)) == 0; }
void Lock() {
uintptr_t w = ISeqLock ;
Again : (0) ;
if ((w & LOCKED) == 0) {
// Uncontended lock fast path ...
// Locked == 0 implies Contended == 0
ASSERT ((w & CONTENDED) == 0) ;
const uintptr_t v = LCAS (&ISeqLock, w, w|LOCKED) ;
if (v == w) {
return ;
}
w = v ;
// CAS failed via remote interference; we lost the race
// Inopportune interleaving
goto Again ;
}
// Contended arrival ...
// Push reference to TSelf onto arrival stack
PushNode TSelf ;
TSelf.Admit = 0 ;
TSelf.Next = w ;
ASSERT ((uintptr_t(&TSelf) & TAGMASK) == 0) ;
const uintptr_t v = LCAS (&ISeqLock, w, uintptr_t(&TSelf)|LOCKED|CONTENDED) ;
if (v != w) {
w = v ;
goto Again ;
}
// Contended waiting phase ...
// Should park() here
while (TSelf.Admit == 0) {
Pause() ;
}
// FENCE: LD | ST-LD
ASSERT (IsLocked(ISeqLock)) ;
}
void Unlock () {
// Consider: prefetch-for-write
uintptr_t w = ISeqLock ;
ASSERT (IsLocked(w)) ;
if ((w & CONTENDED) == 0) {
// Fast uncontended unlock path ...
// Version number remains installed in ISeqLock
// Transition V:0:1 --> V+1:0:0
// Clear LOCKED and increment version field in ISeqLock
// Stream of unlocked version numbers should be monotone ascending.
uintptr_t v = LCAS (&ISeqLock, w, (w & ~TAGMASK)+VERSION1) ;
if (w == v) {
return ;
}
w = v ;
// New threads have arrived ...
}
ASSERT ((w & (LOCKED|CONTENDED)) == (LOCKED|CONTENDED)) ;
// Impose long-term fairness - anti-starvation
// periodically select tail as successor
// Bernoulli trial determines if we pass to tail.
// All interior elements should be LOCKED|CONTENDED
// The ultimate element is the displaced version number and is LOCKED only.
// Runs in O(T) time where T is the depth of the stack
if (Bernoulli (TailProbability)) {
PushNode * Penult = nullptr ;
PushNode * Tail = reinterpret_cast<PushNode *>(w & ~TAGMASK) ;
// traverse list to find penultimate
for (;;) {
uintptr_t nw = Tail->Next ;
ASSERT (nw & LOCKED) ;
PushNode * nxt = reinterpret_cast<PushNode *>(nw & ~TAGMASK) ;
if ((nw & TAGMASK) == LOCKED) break ;
Penult = Tail ;
Tail = nxt ;
}
if (Penult != nullptr) {
// claim : At least two threads on the stack
ASSERT ((uintptr_t(ISeqLock) & (LOCKED|CONTENDED)) == (LOCKED|CONTENDED)) ;
ASSERT (Tail != nullptr) ;
ASSERT ((uintptr_t(Penult->Next) & ~TAGMASK) == uintptr_t(Tail)) ;
ASSERT ((uintptr_t(Tail->Next) & (LOCKED|CONTENDED)) == LOCKED) ;
// remove tail from list and grant ownership to tail
// Truncate list
Penult->Next = Underlying(Tail->Next) ;
Resume(Tail) ;
return ;
}
}
// There is at least one thread on the stack
// While locked, the stack only grows : push-only
// While the lock remains held, code in unlock should never observe
// the stack shrink, or the top-of-stack change from LOCKED|CONTENDED to any other state.
// It is legal to observe transitions from LOCKED to LOCKED|CONTENDED.
TryPop : (0) ;
auto Head = reinterpret_cast<PushNode *> (w & ~TAGMASK) ;
ASSERT (Head != nullptr) ;
uintptr_t T2 = Head->Next ;
const uintptr_t v = LCAS (&ISeqLock, w, T2) ;
if (v == w) {
// CAS was successful
Resume(Head) ;
return ;
}
// CAS failed; we raced and lost -- inopportune interleaving.
// Some other thread modified ISeqLock in the LD-CAS window above;
// new threads have arrived in the interim.
ASSERT ((v & (LOCKED|CONTENDED)) == (LOCKED|CONTENDED)) ;
if (SuccessionMode == STRICT) {
// Force strict pedantic LIFO ordering.
// Retry the "pop" operation.
// unlock() is NOT constant-time for STRICT mode.
w = v ;
goto TryPop ;
}
ASSERT (SuccessionMode == LIBERAL) ;
// CAS failed, so there must be 2 or more threads on the stack
// Wake the 2nd thread.
// unlock() runs in constant time with no loops but the admission
// order is not strict LIFO.
Head = reinterpret_cast<PushNode *> (v & ~TAGMASK) ;
ASSERT (Head != nullptr) ;
T2 = Head->Next ;
ASSERT ((T2 & (LOCKED|CONTENDED)) == (LOCKED|CONTENDED)) ;
const auto nxt = reinterpret_cast<PushNode *> (T2 & ~TAGMASK) ;
ASSERT (nxt != nullptr) ;
Head->Next = nxt->Next.load() ;
Resume (nxt) ;
}
// Alternative concise usage: int v = SeqLock + [&]{....} ;
// Allow lambda body to return values thru seqlock.
auto operator+ (auto && fn) -> decltype(fn()) {
Guard G (this) ;
return fn() ;
}
void Write (std::function<void(void)> Activity) {
Guard G (this) ;
Activity() ;
}
// Encapsulated read operator that takes a reader-CS expressed as a lambda
// This allows us to hide the policy code that reverts from
// optimistic reads to pessimistic writes.
void Read (std::function<void(void)> Activity) {
// Optimistic-speculative phase
int OptimisticAttempts = 5 ;
for (;;) {
if (--OptimisticAttempts < 0) break ;
if (TryRead (Activity) == 0) return ;
}
// revert to pessimistic mutual exclusion - ensure eventual progress
Write(Activity) ;
}
// TryRead() return values
// 0 : success : observed values were consistent
// 1 : aborted because writer was present at start of attempt
// 2 : aborted because writer arrived during attempt
int TryRead (std::function<void(void)> Activity) {
const auto v = ReadVersion() ;
if (IsLocked(v)) return 1 ;
// speculative read attempt ...
// The reader can see inconsistent values and is expected to
// tolerate and behave gracefully.
// Be extremely careful of dependent loads and pointers!
// FENCE : LD | LD
Activity() ;
// FENCE : LD | LD
if (Validate(v)) return 0 ;
return 2 ;
}
} ;
static_assert(sizeof(InflatableSeqLock) == sizeof(intptr_t), "invariant") ;
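// Usage sketch (not part of the original gist) : a hypothetical pair of
// counters guarded by an InflatableSeqLock, updated pessimistically and read
// optimistically via the lambda-based Write()/Read()/operator+ interfaces.
static InflatableSeqLock Sync ;
static int A = 0 ;
static int B = 0 ;

static void Example () {
  // Pessimistic writer critical section
  Sync.Write ([&] { A += 1 ; B += 1 ; }) ;
  // Equivalent concise form via operator+
  Sync + [&] { A += 1 ; B += 1 ; } ;
  // Optimistic reader : the lambda may observe inconsistent (A,B) mid-write
  // and must tolerate that ; Read() retries and eventually falls back to Write().
  int SumA = 0 ;
  int SumB = 0 ;
  Sync.Read ([&] { SumA = A ; SumB = B ; }) ;
  ASSERT (SumA == SumB) ;
}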