Kaldaien/event-wait timing and throttling

## event-wait timing and throttling
// Per-wait timing figure for the current run of consecutive waits on a
//   single handle.
//
//   Returns -1.0f while `sequence` is still zero (the run has only just
//     begun); otherwise the SK_DeltaPerfMS (...) result divided by `sequence`.
//
//   NOTE(review): SK_DeltaPerfMS is defined elsewhere; presumably it yields
//     the milliseconds elapsed since the tick count it is handed, which would
//       make this the average time (ms) between repeated waits -- confirm.
float
SK_Sched_ThreadContext::most_recent_wait_s::getRate (void)
{
  // Nothing to average until the same handle has repeated at least once
  if (sequence <= 0)
    return -1.0f;

  // Ticks between the first wait of this run and the most recent one
  const auto run_ticks =
    last_wait.QuadPart - start.QuadPart;

  // Shift "now" back by the run length before handing it to the helper
  const float run_ms =
    SK_DeltaPerfMS (SK_CurrentPerf ().QuadPart - run_ticks, 1);

  return
    run_ms / static_cast <float> (sequence);
}

typedef
DWORD (WINAPI *WaitForSingleObjectEx_pfn)(
  _In_ HANDLE hHandle,
  _In_ DWORD  dwMilliseconds,
  _In_ BOOL   bAlertable
);

WaitForSingleObjectEx_pfn
WaitForSingleObjectEx_Original = nullptr;

// Counters and switches defined elsewhere; WaitForSingleObjectEx_Detour
//   reads the switches and bumps the counters with InterlockedIncrement.

// Incremented right before each SleepEx (1, TRUE) throttle in the detour.
extern volatile LONG SK_POE2_Horses_Held;
// Incremented once per iteration of the detour's YieldProcessor spin loop.
extern volatile LONG SK_POE2_SMT_Assists;
// Incremented each time the detour calls SetThreadPriorityBoost in its
//   throttling branch (for both the TRUE and the FALSE setting).
extern volatile LONG SK_POE2_ThreadBoostsKilled;
// Master switch: the detour's throttling branch is skipped when false.
extern          bool SK_POE2_FixUnityEmployment;
// Enables the YieldProcessor micro-spin that precedes the 1 ms sleep.
extern          bool SK_POE2_Stage2UnityCancer;
// Extends the priority-boost management to every thread instead of only
//   threads whose debug name is "Worker Thread" (the sleep still requires
//     the name match).
extern          bool SK_POE2_Stage3UnityCancer;

// Stage 4 and 5 terminal Unity cancer have yet
//   to be discovered--it's only a matter of time

// Hash-string compare the stupid / lazy way
#include <unordered_set>

// Replacement for WaitForSingleObjectEx that records per-thread wait
//   statistics and rate-limits threads that spin on an always-signaled
//     event.
//
//   hHandle, dwMilliseconds, bAlertable
//     Forwarded unchanged to WaitForSingleObjectEx_Original.
//
//   Returns whatever WaitForSingleObjectEx_Original returned, unmodified.
//
//   Side effects (all on the calling thread's SK_TLS scheduler record,
//   plus the global SK_POE2_* counters):
//
//     * alert_waits / objects_waited bookkeeping
//     * mru_wait run tracking (same handle waited on back-to-back)
//     * optional SetThreadPriorityBoost changes, a brief YieldProcessor
//         spin and a 1 ms alertable SleepEx when the throttle applies
//
//   NOTE(review): pTLS and pTLS->scheduler.objects_waited are dereferenced
//     without a null check, exactly as before -- confirm both are always
//       valid by the time this hook can run.
DWORD
WINAPI
WaitForSingleObjectEx_Detour (
  _In_ HANDLE hHandle,
  _In_ DWORD  dwMilliseconds,
  _In_ BOOL   bAlertable )
{
  SK_TLS *pTLS =
    SK_TLS_Bottom ();

  auto& mru_wait =
    pTLS->scheduler.mru_wait;

  if (bAlertable)
    InterlockedIncrement (&pTLS->scheduler.alert_waits);

  // Consider double-buffering this since the information is used almost
  //   exclusively by OTHER threads, and they have to take a synchronous
  //     copy to get at it without one thread trampling another.
  SK_Sched_ThreadContext::wait_record_s& scheduled_wait =
    (*pTLS->scheduler.objects_waited) [hHandle];

  scheduled_wait.calls++;

  // NOTE(review): accumulation was disabled at some point; the only thing
  //   left is this reset for INFINITE waits -- confirm that is intended.
  if (dwMilliseconds == INFINITE)
    scheduled_wait.time = 0;//+= dwMilliseconds;

  LARGE_INTEGER liStart =
    SK_QueryPerf ();

  bool same_as_last_time =
    ( mru_wait.handle == hHandle );

  mru_wait.handle = hHandle;

  auto ret =
    WaitForSingleObjectEx_Original (
      hHandle, dwMilliseconds, bAlertable
    );

  // Same event as the previous wait on this thread: extend the run
  if ( same_as_last_time )
  {
    mru_wait.last_wait = liStart;
    mru_wait.sequence++;
  }

  // Different event: the thread moved on to something else, so a new run
  //   begins here.
  else
  {
    mru_wait.start     = liStart;
    mru_wait.last_wait = liStart;
    mru_wait.sequence  = 0;
  }

  // Only signaled, infinite, alertable waits are throttle candidates.
  //
  //   bAlertable is tested for non-zero (not == TRUE) so that it agrees
  //     with the alert_waits accounting above and with how the Win32 API
  //       itself interprets a BOOL.
  const bool throttle_candidate =
    ( ret            == WAIT_OBJECT_0 && SK_POE2_FixUnityEmployment &&
      dwMilliseconds == INFINITE      && bAlertable != FALSE );

  if ( throttle_candidate )
  {
    // Thread names that receive the full throttle (set lookup keeps the
    //   door open for more names later).
    static const
      std::unordered_set < std::wstring >
          worker_thread_names { L"Worker Thread" };

    const bool hardly_working =
      worker_thread_names.count (pTLS->debug.name) != 0;

    if ( SK_POE2_Stage3UnityCancer || hardly_working )
    {
      // Remember the thread's own boost setting the first time we are
      //   about to touch it (-1 == not captured yet) so that it can be
      //     put back once the thread stops qualifying.
      if (mru_wait.preemptive == -1)
      {
        GetThreadPriorityBoost ( GetCurrentThread (),
                                 &mru_wait.preemptive );
      }

      if (mru_wait.getRate () >= 0.02f)
      {
        // SetThreadPriorityBoost's second argument is "disable", so TRUE
        //   turns dynamic priority boosting OFF.  Misusing this hurts
        //     performance.
        if (mru_wait.preemptive != TRUE)
        {
          SetThreadPriorityBoost ( GetCurrentThread (), TRUE );
          InterlockedIncrement   (&SK_POE2_ThreadBoostsKilled);
        }

        //
        // (Everything below applies to "Worker Thread" threads only)
        //
        if (hardly_working)
        {
          LARGE_INTEGER core_sleep_begin =
            SK_QueryPerf ();

          // The wait came back almost immediately: the event was
          //   effectively already signaled.
          if (SK_DeltaPerfMS (liStart.QuadPart, 1) < 0.25)
          {
            if (SK_POE2_Stage2UnityCancer)
            {
              // Micro-spin on this core to try and give its logical
              //   (SMT) sibling more execution resources.
              while (SK_DeltaPerfMS (core_sleep_begin.QuadPart, 1) < 0.00005)
              {
                InterlockedIncrement (&SK_POE2_SMT_Assists);

                // Very brief pause; of even less use on processors
                //   without SMT.
                YieldProcessor       (                    );
              }
            }

            InterlockedIncrement (&SK_POE2_Horses_Held);

            // 1 ms voluntary reschedule
            // =========================
            //
            //  Rate-limits a job dispatcher that cannot limit itself and
            //    would otherwise flood the kernel with waits that return
            //      immediately, doing little more than bouncing in and
            //        out of kernel-mode.
            //
          //SwitchToThread       (       );
            SleepEx              (1, TRUE);
          }
        }
      }

      // Run is too short (or has only just started): make sure boosting
      //   is enabled.
      else
      {
        if (mru_wait.preemptive != FALSE)
        {
          SetThreadPriorityBoost (GetCurrentThread (), FALSE);
          InterlockedIncrement   (&SK_POE2_ThreadBoostsKilled);
        }
      }
    }
  }

  // Not a throttle candidate: restore the captured boost setting, if any
  else if (mru_wait.preemptive != -1)
  {
    SetThreadPriorityBoost (
      GetCurrentThread (),
        mru_wait.preemptive );

    // Back to "nothing captured"; the next qualifying wait re-captures.
    mru_wait.preemptive = -1;
  }

  return ret;
}

## tls-local scheduler data

// Per-thread scheduler bookkeeping, reached through SK_TLS as
//   pTLS->scheduler by WaitForSingleObjectEx_Detour.
//
//   Every member now carries a default initializer.  In particular,
//     mru_wait.preemptive must start at -1 because the detour uses -1 as
//       its "boost setting not captured" sentinel.
//
//   NOTE(review): the initializers only take effect if the enclosing
//     object is actually constructed (not merely zero-filled) -- confirm
//       how SK_TLS is allocated.
class SK_Sched_ThreadContext
{
public:
    DWORD         priority      = THREAD_PRIORITY_NORMAL;
  //UINT          ideal_cpu     =             0;
    DWORD_PTR     affinity_mask = (DWORD_PTR)-1;
    bool          lock_affinity = false;
    bool          background_io = false;

    // Number of alertable waits this thread has issued through the hook
volatile
    LONG          alert_waits   = 0;

    // Statistics for one waited-on handle
    struct wait_record_s {
      LONG        calls = 0; // Waits issued on the handle
      LONG        time  = 0; // Only ever reset to 0 by the detour at present
    };

    // Handle -> statistics.  The detour dereferences this without a null
    //   check, so the owner must allocate it before the thread's first
    //     hooked wait.
    std::unordered_map <HANDLE, wait_record_s>*
                  objects_waited = nullptr;

    // State of the current run of back-to-back waits on one handle
    struct most_recent_wait_s
    {
      HANDLE        handle     = nullptr; // Handle of the previous wait
      LARGE_INTEGER start      = { };     // Start time of the run's first wait
      LARGE_INTEGER last_wait  = { };     // Start time of the latest wait
      LONG          sequence   = 0;       // Repeats since the run began
      BOOL          preemptive = -1;      // Saved boost setting; -1 == none

      float getRate (void);
    } mru_wait;
};
	// Per-wait timing figure for the current run of consecutive waits on a
	//   single handle.
	//
	//   Returns -1.0f while `sequence` is still zero (the run has only just
	//     begun); otherwise the SK_DeltaPerfMS (...) result divided by `sequence`.
	//
	//   NOTE(review): SK_DeltaPerfMS is defined elsewhere; presumably it yields
	//     the milliseconds elapsed since the tick count it is handed, which would
	//       make this the average time (ms) between repeated waits -- confirm.
	float
	SK_Sched_ThreadContext::most_recent_wait_s::getRate (void)
	{
	  // Nothing to average until the same handle has repeated at least once
	  if (sequence <= 0)
	    return -1.0f;

	  // Ticks between the first wait of this run and the most recent one
	  const auto run_ticks =
	    last_wait.QuadPart - start.QuadPart;

	  // Shift "now" back by the run length before handing it to the helper
	  const float run_ms =
	    SK_DeltaPerfMS (SK_CurrentPerf ().QuadPart - run_ticks, 1);

	  return
	    run_ms / static_cast <float> (sequence);
	}

	typedef
	DWORD (WINAPI *WaitForSingleObjectEx_pfn)(
	_In_ HANDLE hHandle,
	_In_ DWORD dwMilliseconds,
	_In_ BOOL bAlertable
	);

	WaitForSingleObjectEx_pfn
	WaitForSingleObjectEx_Original = nullptr;

	// Counters and switches defined elsewhere; WaitForSingleObjectEx_Detour
	//   reads the switches and bumps the counters with InterlockedIncrement.

	// Incremented right before each SleepEx (1, TRUE) throttle in the detour.
	extern volatile LONG SK_POE2_Horses_Held;
	// Incremented once per iteration of the detour's YieldProcessor spin loop.
	extern volatile LONG SK_POE2_SMT_Assists;
	// Incremented each time the detour calls SetThreadPriorityBoost in its
	//   throttling branch (for both the TRUE and the FALSE setting).
	extern volatile LONG SK_POE2_ThreadBoostsKilled;
	// Master switch: the detour's throttling branch is skipped when false.
	extern bool SK_POE2_FixUnityEmployment;
	// Enables the YieldProcessor micro-spin that precedes the 1 ms sleep.
	extern bool SK_POE2_Stage2UnityCancer;
	// Extends the priority-boost management to every thread instead of only
	//   threads whose debug name is "Worker Thread" (the sleep still requires
	//     the name match).
	extern bool SK_POE2_Stage3UnityCancer;

	// Stage 4 and 5 terminal Unity cancer have yet
	// to be discovered--it's only a matter of time

	// Hash-string compare the stupid / lazy way
	#include <unordered_set>

	// Replacement for WaitForSingleObjectEx that records per-thread wait
	//   statistics and rate-limits threads that spin on an always-signaled
	//     event.
	//
	//   hHandle, dwMilliseconds, bAlertable
	//     Forwarded unchanged to WaitForSingleObjectEx_Original.
	//
	//   Returns whatever WaitForSingleObjectEx_Original returned, unmodified.
	//
	//   Side effects (all on the calling thread's SK_TLS scheduler record,
	//   plus the global SK_POE2_* counters):
	//
	//     * alert_waits / objects_waited bookkeeping
	//     * mru_wait run tracking (same handle waited on back-to-back)
	//     * optional SetThreadPriorityBoost changes, a brief YieldProcessor
	//         spin and a 1 ms alertable SleepEx when the throttle applies
	//
	//   NOTE(review): pTLS and pTLS->scheduler.objects_waited are dereferenced
	//     without a null check, exactly as before -- confirm both are always
	//       valid by the time this hook can run.
	DWORD
	WINAPI
	WaitForSingleObjectEx_Detour (
	  _In_ HANDLE hHandle,
	  _In_ DWORD  dwMilliseconds,
	  _In_ BOOL   bAlertable )
	{
	  SK_TLS *pTLS =
	    SK_TLS_Bottom ();

	  auto& mru_wait =
	    pTLS->scheduler.mru_wait;

	  if (bAlertable)
	    InterlockedIncrement (&pTLS->scheduler.alert_waits);

	  // Consider double-buffering this since the information is used almost
	  //   exclusively by OTHER threads, and they have to take a synchronous
	  //     copy to get at it without one thread trampling another.
	  SK_Sched_ThreadContext::wait_record_s& scheduled_wait =
	    (*pTLS->scheduler.objects_waited) [hHandle];

	  scheduled_wait.calls++;

	  // NOTE(review): accumulation was disabled at some point; the only thing
	  //   left is this reset for INFINITE waits -- confirm that is intended.
	  if (dwMilliseconds == INFINITE)
	    scheduled_wait.time = 0;//+= dwMilliseconds;

	  LARGE_INTEGER liStart =
	    SK_QueryPerf ();

	  bool same_as_last_time =
	    ( mru_wait.handle == hHandle );

	  mru_wait.handle = hHandle;

	  auto ret =
	    WaitForSingleObjectEx_Original (
	      hHandle, dwMilliseconds, bAlertable
	    );

	  // Same event as the previous wait on this thread: extend the run
	  if ( same_as_last_time )
	  {
	    mru_wait.last_wait = liStart;
	    mru_wait.sequence++;
	  }

	  // Different event: the thread moved on to something else, so a new run
	  //   begins here.
	  else
	  {
	    mru_wait.start     = liStart;
	    mru_wait.last_wait = liStart;
	    mru_wait.sequence  = 0;
	  }

	  // Only signaled, infinite, alertable waits are throttle candidates.
	  //
	  //   bAlertable is tested for non-zero (not == TRUE) so that it agrees
	  //     with the alert_waits accounting above and with how the Win32 API
	  //       itself interprets a BOOL.
	  const bool throttle_candidate =
	    ( ret            == WAIT_OBJECT_0 && SK_POE2_FixUnityEmployment &&
	      dwMilliseconds == INFINITE      && bAlertable != FALSE );

	  if ( throttle_candidate )
	  {
	    // Thread names that receive the full throttle (set lookup keeps the
	    //   door open for more names later).
	    static const
	      std::unordered_set < std::wstring >
	          worker_thread_names { L"Worker Thread" };

	    const bool hardly_working =
	      worker_thread_names.count (pTLS->debug.name) != 0;

	    if ( SK_POE2_Stage3UnityCancer || hardly_working )
	    {
	      // Remember the thread's own boost setting the first time we are
	      //   about to touch it (-1 == not captured yet) so that it can be
	      //     put back once the thread stops qualifying.
	      if (mru_wait.preemptive == -1)
	      {
	        GetThreadPriorityBoost ( GetCurrentThread (),
	                                 &mru_wait.preemptive );
	      }

	      if (mru_wait.getRate () >= 0.02f)
	      {
	        // SetThreadPriorityBoost's second argument is "disable", so TRUE
	        //   turns dynamic priority boosting OFF.  Misusing this hurts
	        //     performance.
	        if (mru_wait.preemptive != TRUE)
	        {
	          SetThreadPriorityBoost ( GetCurrentThread (), TRUE );
	          InterlockedIncrement   (&SK_POE2_ThreadBoostsKilled);
	        }

	        //
	        // (Everything below applies to "Worker Thread" threads only)
	        //
	        if (hardly_working)
	        {
	          LARGE_INTEGER core_sleep_begin =
	            SK_QueryPerf ();

	          // The wait came back almost immediately: the event was
	          //   effectively already signaled.
	          if (SK_DeltaPerfMS (liStart.QuadPart, 1) < 0.25)
	          {
	            if (SK_POE2_Stage2UnityCancer)
	            {
	              // Micro-spin on this core to try and give its logical
	              //   (SMT) sibling more execution resources.
	              while (SK_DeltaPerfMS (core_sleep_begin.QuadPart, 1) < 0.00005)
	              {
	                InterlockedIncrement (&SK_POE2_SMT_Assists);

	                // Very brief pause; of even less use on processors
	                //   without SMT.
	                YieldProcessor       (                    );
	              }
	            }

	            InterlockedIncrement (&SK_POE2_Horses_Held);

	            // 1 ms voluntary reschedule
	            // =========================
	            //
	            //  Rate-limits a job dispatcher that cannot limit itself and
	            //    would otherwise flood the kernel with waits that return
	            //      immediately, doing little more than bouncing in and
	            //        out of kernel-mode.
	            //
	          //SwitchToThread       (       );
	            SleepEx              (1, TRUE);
	          }
	        }
	      }

	      // Run is too short (or has only just started): make sure boosting
	      //   is enabled.
	      else
	      {
	        if (mru_wait.preemptive != FALSE)
	        {
	          SetThreadPriorityBoost (GetCurrentThread (), FALSE);
	          InterlockedIncrement   (&SK_POE2_ThreadBoostsKilled);
	        }
	      }
	    }
	  }

	  // Not a throttle candidate: restore the captured boost setting, if any
	  else if (mru_wait.preemptive != -1)
	  {
	    SetThreadPriorityBoost (
	      GetCurrentThread (),
	        mru_wait.preemptive );

	    // Back to "nothing captured"; the next qualifying wait re-captures.
	    mru_wait.preemptive = -1;
	  }

	  return ret;
	}

	// Per-thread scheduler bookkeeping, reached through SK_TLS as
	//   pTLS->scheduler by WaitForSingleObjectEx_Detour.
	//
	//   Every member now carries a default initializer.  In particular,
	//     mru_wait.preemptive must start at -1 because the detour uses -1 as
	//       its "boost setting not captured" sentinel.
	//
	//   NOTE(review): the initializers only take effect if the enclosing
	//     object is actually constructed (not merely zero-filled) -- confirm
	//       how SK_TLS is allocated.
	class SK_Sched_ThreadContext
	{
	public:
	    DWORD         priority      = THREAD_PRIORITY_NORMAL;
	  //UINT          ideal_cpu     =             0;
	    DWORD_PTR     affinity_mask = (DWORD_PTR)-1;
	    bool          lock_affinity = false;
	    bool          background_io = false;

	    // Number of alertable waits this thread has issued through the hook
	    volatile
	    LONG          alert_waits   = 0;

	    // Statistics for one waited-on handle
	    struct wait_record_s {
	      LONG        calls = 0; // Waits issued on the handle
	      LONG        time  = 0; // Only ever reset to 0 by the detour at present
	    };

	    // Handle -> statistics.  The detour dereferences this without a null
	    //   check, so the owner must allocate it before the thread's first
	    //     hooked wait.
	    std::unordered_map <HANDLE, wait_record_s>*
	                  objects_waited = nullptr;

	    // State of the current run of back-to-back waits on one handle
	    struct most_recent_wait_s
	    {
	      HANDLE        handle     = nullptr; // Handle of the previous wait
	      LARGE_INTEGER start      = { };     // Start time of the run's first wait
	      LARGE_INTEGER last_wait  = { };     // Start time of the latest wait
	      LONG          sequence   = 0;       // Repeats since the run began
	      BOOL          preemptive = -1;      // Saved boost setting; -1 == none

	      float getRate (void);
	    } mru_wait;
	};