strictlymike/inspect.cpp

## inspect.cpp
/**
 * @file
 * A rough sketch of providing diagnostic information about to the state of an
 * application relevant to diagnosing prolonged high CPU. When prolonged high
 * CPU occurs, the high-level question that must be answered is:
 *
 * 	1. What is the application doing?
 * 		and,
 * 	2. Why aren't CPU throttling controls working? (if applicable)
 *
 * Question (1) can be answered by investigating where the application is
 * spending its time, which can be learned using readily available OS APIs.
 *
 * Unfortunately, question (2) is likely to require application developers to
 * expose application-specific instrumentation to expose the state of various
 * counters and criteria used to make a decision to do additional work. Because
 * this facility is being developed in the abstract (without any particular
 * target in mind) and over a short development period (1 day), it will not
 * address question (2).
 *
 * To address question (1), this application will obtain several pieces of
 * information based on live artifacts inspected with OS APIs over several
 * iterations:
 *
 * 	1. What threads are running (identified by entry point)?
 * 	2. What is the CPU utilization of each thread?
 * 	3. What instructions are being executed in each thread?
 *
 * The information in (1) is the basis of (2) and (3). The information in (2)
 * will allow engineers to identify which threads are using the most resources.
 * And the information in (3) may allow engineers to identify specific
 * subroutines or basic blocks that are executing repeatedly.
 *
 * Because threads may be created and destroyed at any time, this application
 * must continually obtain (1) and then dispatch (2) and (3) against each
 * thread. The result may be that new threads appear and disappear in the
 * output of this program.
 *
 * Ideally, output would be a binary or XML file that could be loaded into a
 * user interface for browsing the data in a coherent way. In this sample
 * program, information will be collected over a number iterations and
 * displayed on the console before terminating.
 */
#include <windows.h>
#include <Psapi.h>
#include <TlHelp32.h>

#include <stdio.h>

#include <vector>
#include <map>
#include <algorithm>
#include <numeric>

#pragma comment(lib, "psapi.lib")

/* TODO: remove these and remove/update any code brackted by or using them */
#define DEBUG							(1)
#define DEBUG_CPUS						(4)

/* Tweak away as desired - can move these to getopt() later */
/* Number of iterations over which to watch threads */
#define SAMPLES							(100)
/* Number of msec between samples. This will not be used precisely because it
 * is not necessary. Instead, a Sleep() of this duration will be issued between
 * samples. */
#define SAMPLE_PERIOD_MSEC				(500)

/* Leave these alone */
#define PID_INVALID						(-1)
#define ThreadQuerySetWin32StartAddress	(9)
#define STATUS_SUCCESS					((NTSTATUS)0x00000000L)
#define STATUS_ACCESS_IS_DENIED			((NTSTATUS)0x00000005L)
#define FILETIME_PER_SEC				(10000000)

typedef NTSTATUS (WINAPI *NTQUERYINFOMATIONTHREAD_T)(
	HANDLE,
	LONG,
	PVOID,
	ULONG,
	PULONG
   );

struct ModSpec
{
	std::string Path;
	std::string FinalComponent;	/* FS jargon - indicates filename */
	ULONG_PTR Size;
	ULONG_PTR Start;
	ULONG_PTR End;				/* = Start + Size */
};

typedef std::vector<struct ModSpec> ModVect;

struct IpSpec
{
	ULONG_PTR Off;				/* Instruction pointer e.g. EIP or RIP */
	UINT Times;					/* Occurrences */
	std::string ModName;		/* If applicable */
	std::string ModOff;			/* = Off - ModBase */
};

typedef std::map<ULONG_PTR, struct IpSpec> IPHistogram;

struct ThreadSpec
{
	DWORD Tid;
	LARGE_INTEGER FirstSeen;	/* Raw QPC (ticks) not divided by QPF */
	LARGE_INTEGER LastSeen; 	/* Raw QPC (ticks) not divided by QPF */
	LARGE_INTEGER Ktime;		/* From GetThreadTimes() FILETIME */
	LARGE_INTEGER Utime;		/* From GetThreadTimes() FILETIME */

	bool ModInfoValid;
	ULONG_PTR Entry;			/* Thread entry point */
	std::string ModName;		/* If applicable */
	ULONG_PTR EntryModOff;		/* = Entry - ModBase */
	/* TODO: Why do I choose to use simply float here, but double precision in
	 * the trigger application? Pick one. */
	std::vector<float> Pcpu;	/* With respect to a single CPU */
	IPHistogram IpHist;			/* Instruction Pointers */
};

typedef std::map<DWORD, struct ThreadSpec> ThreadByTid;
typedef std::vector<struct ThreadSpec> ThreadVect;

struct InspectGlobals
{
	LARGE_INTEGER qpcFreq;
	NTQUERYINFOMATIONTHREAD_T NtQueryInformationThread;
};

int InspectByHandle(HANDLE hProc, int pid);
int Usage(char *progname, FILE *out, int ret);
bool InitGlobals(struct InspectGlobals *g);
int InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash);
bool PopulateModVect(HANDLE hProc, ModVect& modlist);
int EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist);
bool FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp);
float CalcCpu(
	LARGE_INTEGER& qpc,
	LARGE_INTEGER& prevQpc,
	LARGE_INTEGER& prevTime,
	FILETIME& Ktimef,
	FILETIME& Utimef
   );
int DisplayInformation(ThreadByTid& threadhash);
bool Compare(struct ThreadSpec& l, struct ThreadSpec& r);
ULONG_PTR GetThreadIp(HANDLE hThread);

template <class T>
T Average(std::vector<T>& v)
{
	return std::accumulate(v.begin(), v.end(), 0) / v.size();
}

struct InspectGlobals gbls;

int
main(int argc, char **argv)
{
	int ret = 1;
	int pid = PID_INVALID;
	HANDLE hProc;
	ThreadByTid ThreadHash;

	if (2 == argc)
	{
		pid = atoi(argv[1]);
	}

	/* If pid == PID_INVALID then pid was never assigned or argv[1] was a
	 * negative number. If pid == 0, then argv[1] was either 0 or not a valid
	 * numeric argument. */
	if (pid <= 0)
	{
		ret = Usage(argv[0], stderr, 1);
		goto exit_main;
	}

	if (!InitGlobals(&gbls))
	{
		goto exit_main;
	}

	/* Minimum requirement for EnumProcessModules() which is used later in this
	 * program. We do not open the handle with PROCESS_ALL_ACCESS in favor of
	 * applying the principle of least privilege. */
	hProc = OpenProcess(
		PROCESS_QUERY_INFORMATION | PROCESS_VM_READ,
		FALSE,
		pid
	   );

	if (NULL == hProc)
	{
		fprintf(stderr, "OpenProcess failed, GLE=%d\n", GetLastError());
		goto exit_main;
	}

	ret = InspectProcess(hProc, pid, ThreadHash);
	if (0 != ret)
	{
		goto exit_main;
	}

	ret = DisplayInformation(ThreadHash);

exit_main:
	/* Free all resources */
	if (NULL != hProc)
	{
		CloseHandle(hProc);
	}

	return ret;
}

int
Usage(char *progname, FILE *out, int ret)
{
	fprintf(out, "Usage: %s <pid>\n", progname);
	fprintf(out, "\n");
	fprintf(out, "Where pid is the pid of the process to inspect\n");

	return ret;
}

bool
InitGlobals(struct InspectGlobals *g)
{
	HMODULE hNtdll;
	BOOL Ok;
	bool ret = true;

	hNtdll = LoadLibrary("ntdll.dll");
	if (hNtdll == INVALID_HANDLE_VALUE)
	{
		goto exit_InitGlobals;
	}

	g->NtQueryInformationThread = (NTQUERYINFOMATIONTHREAD_T) GetProcAddress(
		LoadLibrary("ntdll.dll"),
		"NtQueryInformationThread"
	   );

	if (g->NtQueryInformationThread == NULL)
	{
		goto exit_InitGlobals;
	}

	Ok = QueryPerformanceFrequency(&g->qpcFreq);
	if (!Ok)
	{
		goto exit_InitGlobals;
	}

	ret = true;

exit_InitGlobals:
	return ret;
}

int
InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash)
{
	int ret = 1;
	ModVect modlist;

	printf("Inspecting pid %d\n", pid);

	for (int n=0; n<SAMPLES; n++)
	{
		printf("Sample %d\n", n);
		modlist.clear();
		if (!PopulateModVect(hProc, modlist))
		{
			fprintf(stderr, "Failed to get process modules\n");
			goto exit_InspectProcess;
		}

		EnumAndProfileThreads(pid, threadhash, modlist);

		Sleep(SAMPLE_PERIOD_MSEC);
	}

	ret = 0;

exit_InspectProcess:
	return ret;
}

bool
PopulateModVect(HANDLE hProc, ModVect& modlist)
{
	BOOL Ok;
	bool ret = false;
	HMODULE *Mods = NULL;
	DWORD ModsSize = 0;
	DWORD ModsSizeNeeded = 0;
	int ModCount;
	TCHAR szModName[MAX_PATH];
	/* However unlikely it may be, if I change the size of szModName,
	 * ModNameAvail (below) ensures that the buffer length I pass to
	 * GetModuleFileNameEx varies accordingly. */
	const DWORD ModNameAvail = sizeof(szModName) / sizeof(TCHAR);
	MODULEINFO ModInfo;

	/* EnumProcessModules loop: First iteration will allocate nothing and
	 * discover how much space is needed; subsequent iterations will allocate
	 * @Needed and check if that was sufficient.
	 */
	do
	{
		if (NULL != Mods)
		{
			delete [] Mods;
		}

		if (ModsSizeNeeded > 0)
		{
			delete [] Mods;
			Mods = new HMODULE[ModsSizeNeeded];
			if (NULL == Mods)
			{
				goto exit_PopulateModVect;
			}
			ModsSize = ModsSizeNeeded;
		}

		Ok = EnumProcessModules(hProc, Mods, ModsSize, &ModsSizeNeeded);
		if (!Ok)
		{
			goto exit_PopulateModVect;
		}
	}
	while (ModsSizeNeeded > ModsSize);

	ModCount = ModsSize / sizeof(MODULEINFO);

	for (int i=0; i<ModCount; i++)
	{
		ModSpec ms;

		Ok = GetModuleFileNameEx(hProc, Mods[i], szModName, ModNameAvail);
		if (!Ok)
		{
			/* TODO: Better logic may be to add a module with no name (base,
			 * size, and end only) and continue in the loop. Because such a
			 * condition seems unusual, the necessity of implementing such
			 * logic would depend on the binary that is targeted for
			 * instrumentation. */
			goto exit_PopulateModVect;
		}

		Ok = GetModuleInformation(hProc, Mods[i], &ModInfo, sizeof(ModInfo));
		if (!Ok)
		{
			goto exit_PopulateModVect;
		}

		/* I suspect string is declared with _Elem = char even if _UNICODE and
		 * UNICODE are defined, therefore this might actually require the use
		 * of basic_string specifying _Elem = TCHAR in order to truly work for
		 * Unicode. This poc will assume the target application has loaded
		 * modules only having ANSI names and that UNICODE is not used. */
		ms.Path = szModName;

		size_t idx = ms.Path.rfind('\\');

		/* Although a module name is very unlikely to have a trailing slash,
		 * that case is easily included as a protection against "shenanigans".
		 */
		if ((idx == std::string::npos) && (idx >= ms.Path.length()-1))
		{
			ms.FinalComponent = ms.Path;
		}
		else
		{
			ms.FinalComponent = ms.Path.substr(idx+1);
		}

		ms.Size = ModInfo.SizeOfImage;
		ms.Start = (ULONG_PTR) ModInfo.lpBaseOfDll;
		ms.End = (ULONG_PTR) ModInfo.lpBaseOfDll + ModInfo.SizeOfImage;

		modlist.push_back(ms);
	}

	ret = true;

exit_PopulateModVect:
	if (NULL != Mods)
	{
		delete [] Mods;
	}
	return ret;
}

/**
 * @fn
 * Adapted from:
 * Traversing the Thread List [msdn]:
 * https://msdn.microsoft.com/en-us/library/windows/desktop/ms686852%28v=vs.85%29.aspx
 */
int
EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist)
{
	LARGE_INTEGER qpc;
	LARGE_INTEGER prevQpc;
	LARGE_INTEGER prevKtime;
	LARGE_INTEGER prevUtime;
	LARGE_INTEGER prevTime;
	BOOL Ok;
	int ret = 1;
	FILETIME Ctimef;
	FILETIME Etimef;
	FILETIME Ktimef;
	FILETIME Utimef;
	HANDLE hSnap = INVALID_HANDLE_VALUE;
	HANDLE hThread = INVALID_HANDLE_VALUE;
	DWORD Tid;
	/* I conclude that the '32' part of the toolhelp API names is an
	 * anachronism because (1) there is no THREADENTRY64 and because (2) I have
	 * used these APIs to access information for 64-bit processes. However,
	 * since there is no API specific to 64-bit applications, I infer that the
	 * bit-ness of this diagnostic process may need to match the bit-ness of
	 * the target process that it is meant to diagnose, or otherwise that this
	 * diagnostic application must be cross-compiled if it is to be launched
	 * against applications of both architectures. This is already the case
	 * to begin with due to details such as pointer lengths. */
	THREADENTRY32 te;
	ULONG_PTR ThreadEntryAddr;
	NTSTATUS Status;
	ModSpec modtmp;
	float Pcpu;
	float PcpuTot;

	/* Capture ticks at moment before creation of toolhelp snapshot. This will
	 * be recorded as the first/last time the thread was seen. */
	Ok = QueryPerformanceCounter(&qpc);
	if (!Ok)
	{
		goto exit_EnumAndProfileThreads;
	}

	/* The API will ignore the @pid parameter, so results must be filtered by
	 * the caller. @pid is included here to opportunistically limit the result
	 * size in the event that the API changes, etc. See:
	 * https://msdn.microsoft.com/en-us/library/windows/desktop/ms682489%28v=vs.85%29.aspx
	 *
	 * Specifically:
	 * "th32ProcessID [in]... The process identifier of the process to be
	 * included in the snapshot.  This parameter can be zero to indicate the
	 * current process. This parameter is used when the TH32CS_SNAPHEAPLIST,
	 * TH32CS_SNAPMODULE, TH32CS_SNAPMODULE32, or TH32CS_SNAPALL value is
	 * specified. Otherwise, it is ignored and all processes are included in
	 * the snapshot."
	 */
	hSnap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, pid);
	if (hSnap == INVALID_HANDLE_VALUE)
	{
		goto exit_EnumAndProfileThreads;
	}

	te.dwSize = sizeof(te);
	te.cntUsage = 0; /* Unused and always zero per THREADENTRY32 [msdn] */

	Ok = Thread32First(hSnap, &te);
	if (!Ok)
	{
		goto exit_EnumAndProfileThreads;
	}

	PcpuTot = 0.0f;

	do
	{
		if (te.th32OwnerProcessID != pid)
		{
			continue;
		}

		Tid = te.th32ThreadID;

		hThread = OpenThread(
			THREAD_QUERY_INFORMATION | THREAD_SUSPEND_RESUME |
			THREAD_GET_CONTEXT,
			FALSE,
			Tid
		   );
		if (hThread == INVALID_HANDLE_VALUE)
		{
			goto exit_EnumAndProfileThreads;
		}

		Status = gbls.NtQueryInformationThread(
			hThread,
			ThreadQuerySetWin32StartAddress,
			&ThreadEntryAddr,
			sizeof(ThreadEntryAddr),
			NULL
		   );
		if (Status != STATUS_SUCCESS)
		{
			goto exit_EnumAndProfileThreads;
		}

		Ok = GetThreadTimes(hThread, &Ctimef, &Etimef, &Ktimef, &Utimef);
		if (!Ok)
		{
			goto exit_EnumAndProfileThreads;
		}

		ULONG_PTR ip = GetThreadIp(hThread);
		Ok = FindMod(modlist, ip, modtmp);
		if (Ok)
		{
			printf(
				"Tid = %d, Rip = 0x%p (%s+0x%p)\n",
				Tid,
				ip,
				modtmp.FinalComponent.c_str(),
				ip - modtmp.Start
			   );
		}
		else
		{
			printf("Tid = %d, Rip = 0x%p\n", Tid, ip);
		}

		ThreadSpec ts;

		if (threadhash.find(Tid) == threadhash.end())
		{
			/* New thread specification */
			ts.Tid = te.th32ThreadID;
			ts.FirstSeen = qpc;
			ts.LastSeen = qpc;

			/* ts.Ktime <== LARGE_INTEGER(Ktimef) */
			ts.Ktime.LowPart = Ktimef.dwLowDateTime;
			ts.Ktime.HighPart = Ktimef.dwHighDateTime;

			/* ts.Utime <== LARGE_INTEGER(Utimef) */
			ts.Utime.LowPart = Utimef.dwLowDateTime;
			ts.Utime.HighPart = Utimef.dwHighDateTime;

			ts.Entry = ThreadEntryAddr;

			/* FIXFIX: Mixing and matching bool and BOOL, oh the humanity */
			Ok = FindMod(modlist, ThreadEntryAddr, modtmp);
			if (Ok)
			{
				ts.ModInfoValid = true;
				ts.ModName = modtmp.FinalComponent;
				ts.EntryModOff = ts.Entry - modtmp.Start;
#if DEBUG
				printf("%s+0x%p\n", ts.ModName.c_str(), ts.EntryModOff);
				printf("\t%s starts at 0x%p\n", modtmp.FinalComponent.c_str(), modtmp.Start);
#endif
			}
			else
			{
				ts.ModInfoValid = false;
				ts.ModName = "(unknown module)";
				ts.EntryModOff = ts.Entry;
			}

			/* NOTE: Pcpu is indeterminate the first time a thread is
			 * encountered because there is no historical information for
			 * kernel or user time spent. */

			/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
			 * pending DebugActiveProcess etc. support */

			threadhash[Tid] = ts;
		}
		else
		{
			ThreadSpec& ts = threadhash[Tid];

			/* Collect old data */
			prevQpc = ts.LastSeen;
			prevKtime = ts.Ktime;
			prevUtime = ts.Utime;
			prevTime.QuadPart = ts.Ktime.QuadPart + ts.Utime.QuadPart;

			/* Entry, ModName, EntryModOff are static */

			Pcpu = CalcCpu(qpc, prevQpc, prevTime, Ktimef, Utimef);
			PcpuTot += Pcpu;
			ts.Pcpu.push_back(Pcpu);

			/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
			 * pending DebugActiveProcess etc. support */

			/* Update */
			ts.LastSeen = qpc;
			ts.Ktime.LowPart = Ktimef.dwLowDateTime;
			ts.Ktime.HighPart = Ktimef.dwHighDateTime;
			ts.Utime.LowPart = Utimef.dwLowDateTime;
			ts.Utime.HighPart = Utimef.dwHighDateTime;
		}

	} while (Thread32Next(hSnap, &te));

#if DEBUG
	printf("PcpuTot/%d = %G%%\n", DEBUG_CPUS, PcpuTot/DEBUG_CPUS);
#endif

	ret = 0;

exit_EnumAndProfileThreads:
	return ret;
}

ULONG_PTR
GetThreadIp(HANDLE hThread)
{
	ULONG_PTR ret = 0;
	DWORD Susp = (DWORD)-1;
	BOOL Ok;
	CONTEXT ctx;

	/* Because you cannot obtain a valid thread context from a running thread,
	 * the first step is to increment the suspend count for the thread. Note
	 * that the GetThreadContext() documentation explicitly states that you do
	 * not need to be debugging a thread to use GetThreadContext(), although
	 * that is a common usage. */
	Susp = SuspendThread(hThread);
	if (((DWORD)-1) == Susp)
	{
		goto exit_GetThreadIp;
	}

	ctx.ContextFlags = CONTEXT_CONTROL;
	Ok = GetThreadContext(hThread, &ctx);
	if (!Ok)
	{
		/* Includes logic for resuming the target thread if it was successfully
		 * suspended */
		goto exit_GetThreadIp;
	}

	ret = (ULONG_PTR) ctx.Rip;

exit_GetThreadIp:
	if (((DWORD)-1) != Susp)
	{
		ResumeThread(hThread);
	}

	return ret;
}

/**
 * @fn
 * This application might be more efficient if ModVect were a data structure
 * that sorted its elements upon insertion. */
bool
FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp)
{
	bool ret = false;

	for (int i=0; i<modlist.size(); i++)
	{
		if ((entry >= modlist[i].Start) && (entry < modlist[i].End))
		{
			modtmp = modlist[i];
			ret = true;
			break;
		}
	}
	return ret;
}

float
CalcCpu(
	LARGE_INTEGER& qpc,
	LARGE_INTEGER& prevQpc,
	LARGE_INTEGER& prevTime,
	FILETIME& Ktimef,
	FILETIME& Utimef
   )
{
	LARGE_INTEGER Ktime;
	LARGE_INTEGER Utime;
	LARGE_INTEGER Time;
	float dtUsed;
	float dtElapsed;

	Ktime.LowPart = Ktimef.dwLowDateTime;
	Ktime.HighPart = Ktimef.dwHighDateTime;

	Utime.LowPart = Utimef.dwLowDateTime;
	Utime.HighPart = Utimef.dwHighDateTime;

	Time.QuadPart = Utime.QuadPart + Ktime.QuadPart;

	dtUsed = (float)(Time.QuadPart - prevTime.QuadPart) / FILETIME_PER_SEC;
	dtElapsed = (float)(qpc.QuadPart - prevQpc.QuadPart) / gbls.qpcFreq.QuadPart;

	return 100 * dtUsed / dtElapsed;
}

int
DisplayInformation(ThreadByTid& threadhash)
{
	ThreadVect threadvect;
	ThreadByTid::iterator itt;
	ThreadVect::iterator itv;

	for (itt=threadhash.begin(); itt != threadhash.end(); itt++)
	{
		threadvect.push_back(itt->second);
	}

	sort(threadvect.begin(), threadvect.end(), Compare);

	for (itv=threadvect.begin(); itv != threadvect.end(); itv++)
	{
		if (itv->ModInfoValid)
		{
			printf("%G%%, %d, %s+0x%p\n",
				Average(itv->Pcpu),
				itv->Tid,
				itv->ModName,
				itv->EntryModOff
			   );
		}
		else
		{
			printf("%G%%, %d, 0x%p\n",
				Average(itv->Pcpu),
				itv->Tid,
				itv->Entry
			   );
		}
	}

	return 0;
}

bool
Compare(struct ThreadSpec& l, struct ThreadSpec& r)
{
	float lPcpuAvg = 0.0f;
	float rPcpuAvg = 0.0f;

	lPcpuAvg = Average(l.Pcpu);
	rPcpuAvg = Average(r.Pcpu);

	return (lPcpuAvg < rPcpuAvg);
}
	/**
	* @file
	* A rough sketch of providing diagnostic information about to the state of an
	* application relevant to diagnosing prolonged high CPU. When prolonged high
	* CPU occurs, the high-level question that must be answered is:
	*
	* 1. What is the application doing?
	* and,
	* 2. Why aren't CPU throttling controls working? (if applicable)
	*
	* Question (1) can be answered by investigating where the application is
	* spending its time, which can be learned using readily available OS APIs.
	*
	* Unfortunately, question (2) is likely to require application developers to
	* expose application-specific instrumentation to expose the state of various
	* counters and criteria used to make a decision to do additional work. Because
	* this facility is being developed in the abstract (without any particular
	* target in mind) and over a short development period (1 day), it will not
	* address question (2).
	*
	* To address question (1), this application will obtain several pieces of
	* information based on live artifacts inspected with OS APIs over several
	* iterations:
	*
	* 1. What threads are running (identified by entry point)?
	* 2. What is the CPU utilization of each thread?
	* 3. What instructions are being executed in each thread?
	*
	* The information in (1) is the basis of (2) and (3). The information in (2)
	* will allow engineers to identify which threads are using the most resources.
	* And the information in (3) may allow engineers to identify specific
	* subroutines or basic blocks that are executing repeatedly.
	*
	* Because threads may be created and destroyed at any time, this application
	* must continually obtain (1) and then dispatch (2) and (3) against each
	* thread. The result may be that new threads appear and disappear in the
	* output of this program.
	*
	* Ideally, output would be a binary or XML file that could be loaded into a
	* user interface for browsing the data in a coherent way. In this sample
	* program, information will be collected over a number iterations and
	* displayed on the console before terminating.
	*/
	#include <windows.h>
	#include <Psapi.h>
	#include <TlHelp32.h>

	#include <stdio.h>

	#include <vector>
	#include <map>
	#include <algorithm>
	#include <numeric>

	#pragma comment(lib, "psapi.lib")

	/* TODO: remove these and remove/update any code brackted by or using them */
	#define DEBUG (1)
	#define DEBUG_CPUS (4)

	/* Tweak away as desired - can move these to getopt() later */
	/* Number of iterations over which to watch threads */
	#define SAMPLES (100)
	/* Number of msec between samples. This will not be used precisely because it
	* is not necessary. Instead, a Sleep() of this duration will be issued between
	* samples. */
	#define SAMPLE_PERIOD_MSEC (500)

	/* Leave these alone */
	#define PID_INVALID (-1)
	#define ThreadQuerySetWin32StartAddress (9)
	#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
	#define STATUS_ACCESS_IS_DENIED ((NTSTATUS)0x00000005L)
	#define FILETIME_PER_SEC (10000000)

	typedef NTSTATUS (WINAPI *NTQUERYINFOMATIONTHREAD_T)(
	HANDLE,
	LONG,
	PVOID,
	ULONG,
	PULONG
	);

	struct ModSpec
	{
	std::string Path;
	std::string FinalComponent; /* FS jargon - indicates filename */
	ULONG_PTR Size;
	ULONG_PTR Start;
	ULONG_PTR End; /* = Start + Size */
	};

	typedef std::vector<struct ModSpec> ModVect;

	struct IpSpec
	{
	ULONG_PTR Off; /* Instruction pointer e.g. EIP or RIP */
	UINT Times; /* Occurrences */
	std::string ModName; /* If applicable */
	std::string ModOff; /* = Off - ModBase */
	};

	typedef std::map<ULONG_PTR, struct IpSpec> IPHistogram;

	struct ThreadSpec
	{
	DWORD Tid;
	LARGE_INTEGER FirstSeen; /* Raw QPC (ticks) not divided by QPF */
	LARGE_INTEGER LastSeen; /* Raw QPC (ticks) not divided by QPF */
	LARGE_INTEGER Ktime; /* From GetThreadTimes() FILETIME */
	LARGE_INTEGER Utime; /* From GetThreadTimes() FILETIME */

	bool ModInfoValid;
	ULONG_PTR Entry; /* Thread entry point */
	std::string ModName; /* If applicable */
	ULONG_PTR EntryModOff; /* = Entry - ModBase */
	/* TODO: Why do I choose to use simply float here, but double precision in
	* the trigger application? Pick one. */
	std::vector<float> Pcpu; /* With respect to a single CPU */
	IPHistogram IpHist; /* Instruction Pointers */
	};

	typedef std::map<DWORD, struct ThreadSpec> ThreadByTid;
	typedef std::vector<struct ThreadSpec> ThreadVect;

	struct InspectGlobals
	{
	LARGE_INTEGER qpcFreq;
	NTQUERYINFOMATIONTHREAD_T NtQueryInformationThread;
	};

	int InspectByHandle(HANDLE hProc, int pid);
	int Usage(char progname, FILE out, int ret);
	bool InitGlobals(struct InspectGlobals *g);
	int InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash);
	bool PopulateModVect(HANDLE hProc, ModVect& modlist);
	int EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist);
	bool FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp);
	float CalcCpu(
	LARGE_INTEGER& qpc,
	LARGE_INTEGER& prevQpc,
	LARGE_INTEGER& prevTime,
	FILETIME& Ktimef,
	FILETIME& Utimef
	);
	int DisplayInformation(ThreadByTid& threadhash);
	bool Compare(struct ThreadSpec& l, struct ThreadSpec& r);
	ULONG_PTR GetThreadIp(HANDLE hThread);

	template <class T>
	T Average(std::vector<T>& v)
	{
	return std::accumulate(v.begin(), v.end(), 0) / v.size();
	}

	struct InspectGlobals gbls;

	int
	main(int argc, char **argv)
	{
	int ret = 1;
	int pid = PID_INVALID;
	HANDLE hProc;
	ThreadByTid ThreadHash;

	if (2 == argc)
	{
	pid = atoi(argv[1]);
	}

	/* If pid == PID_INVALID then pid was never assigned or argv[1] was a
	* negative number. If pid == 0, then argv[1] was either 0 or not a valid
	* numeric argument. */
	if (pid <= 0)
	{
	ret = Usage(argv[0], stderr, 1);
	goto exit_main;
	}

	if (!InitGlobals(&gbls))
	{
	goto exit_main;
	}

	/* Minimum requirement for EnumProcessModules() which is used later in this
	* program. We do not open the handle with PROCESS_ALL_ACCESS in favor of
	* applying the principle of least privilege. */
	hProc = OpenProcess(
	PROCESS_QUERY_INFORMATION \| PROCESS_VM_READ,
	FALSE,
	pid
	);

	if (NULL == hProc)
	{
	fprintf(stderr, "OpenProcess failed, GLE=%d\n", GetLastError());
	goto exit_main;
	}

	ret = InspectProcess(hProc, pid, ThreadHash);
	if (0 != ret)
	{
	goto exit_main;
	}

	ret = DisplayInformation(ThreadHash);

	exit_main:
	/* Free all resources */
	if (NULL != hProc)
	{
	CloseHandle(hProc);
	}

	return ret;
	}

	int
	Usage(char progname, FILE out, int ret)
	{
	fprintf(out, "Usage: %s <pid>\n", progname);
	fprintf(out, "\n");
	fprintf(out, "Where pid is the pid of the process to inspect\n");

	return ret;
	}

	bool
	InitGlobals(struct InspectGlobals *g)
	{
	HMODULE hNtdll;
	BOOL Ok;
	bool ret = true;

	hNtdll = LoadLibrary("ntdll.dll");
	if (hNtdll == INVALID_HANDLE_VALUE)
	{
	goto exit_InitGlobals;
	}

	g->NtQueryInformationThread = (NTQUERYINFOMATIONTHREAD_T) GetProcAddress(
	LoadLibrary("ntdll.dll"),
	"NtQueryInformationThread"
	);

	if (g->NtQueryInformationThread == NULL)
	{
	goto exit_InitGlobals;
	}

	Ok = QueryPerformanceFrequency(&g->qpcFreq);
	if (!Ok)
	{
	goto exit_InitGlobals;
	}

	ret = true;

	exit_InitGlobals:
	return ret;
	}

	int
	InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash)
	{
	int ret = 1;
	ModVect modlist;

	printf("Inspecting pid %d\n", pid);

	for (int n=0; n<SAMPLES; n++)
	{
	printf("Sample %d\n", n);
	modlist.clear();
	if (!PopulateModVect(hProc, modlist))
	{
	fprintf(stderr, "Failed to get process modules\n");
	goto exit_InspectProcess;
	}

	EnumAndProfileThreads(pid, threadhash, modlist);

	Sleep(SAMPLE_PERIOD_MSEC);
	}

	ret = 0;

	exit_InspectProcess:
	return ret;
	}

	bool
	PopulateModVect(HANDLE hProc, ModVect& modlist)
	{
	BOOL Ok;
	bool ret = false;
	HMODULE *Mods = NULL;
	DWORD ModsSize = 0;
	DWORD ModsSizeNeeded = 0;
	int ModCount;
	TCHAR szModName[MAX_PATH];
	/* However unlikely it may be, if I change the size of szModName,
	* ModNameAvail (below) ensures that the buffer length I pass to
	* GetModuleFileNameEx varies accordingly. */
	const DWORD ModNameAvail = sizeof(szModName) / sizeof(TCHAR);
	MODULEINFO ModInfo;

	/* EnumProcessModules loop: First iteration will allocate nothing and
	* discover how much space is needed; subsequent iterations will allocate
	* @Needed and check if that was sufficient.
	*/
	do
	{
	if (NULL != Mods)
	{
	delete [] Mods;
	}

	if (ModsSizeNeeded > 0)
	{
	delete [] Mods;
	Mods = new HMODULE[ModsSizeNeeded];
	if (NULL == Mods)
	{
	goto exit_PopulateModVect;
	}
	ModsSize = ModsSizeNeeded;
	}

	Ok = EnumProcessModules(hProc, Mods, ModsSize, &ModsSizeNeeded);
	if (!Ok)
	{
	goto exit_PopulateModVect;
	}
	}
	while (ModsSizeNeeded > ModsSize);

	ModCount = ModsSize / sizeof(MODULEINFO);

	for (int i=0; i<ModCount; i++)
	{
	ModSpec ms;

	Ok = GetModuleFileNameEx(hProc, Mods[i], szModName, ModNameAvail);
	if (!Ok)
	{
	/* TODO: Better logic may be to add a module with no name (base,
	* size, and end only) and continue in the loop. Because such a
	* condition seems unusual, the necessity of implementing such
	* logic would depend on the binary that is targeted for
	* instrumentation. */
	goto exit_PopulateModVect;
	}

	Ok = GetModuleInformation(hProc, Mods[i], &ModInfo, sizeof(ModInfo));
	if (!Ok)
	{
	goto exit_PopulateModVect;
	}

	/* I suspect string is declared with _Elem = char even if _UNICODE and
	* UNICODE are defined, therefore this might actually require the use
	* of basic_string specifying _Elem = TCHAR in order to truly work for
	* Unicode. This poc will assume the target application has loaded
	* modules only having ANSI names and that UNICODE is not used. */
	ms.Path = szModName;

	size_t idx = ms.Path.rfind('\\');

	/* Although a module name is very unlikely to have a trailing slash,
	* that case is easily included as a protection against "shenanigans".
	*/
	if ((idx == std::string::npos) && (idx >= ms.Path.length()-1))
	{
	ms.FinalComponent = ms.Path;
	}
	else
	{
	ms.FinalComponent = ms.Path.substr(idx+1);
	}

	ms.Size = ModInfo.SizeOfImage;
	ms.Start = (ULONG_PTR) ModInfo.lpBaseOfDll;
	ms.End = (ULONG_PTR) ModInfo.lpBaseOfDll + ModInfo.SizeOfImage;

	modlist.push_back(ms);
	}

	ret = true;

	exit_PopulateModVect:
	if (NULL != Mods)
	{
	delete [] Mods;
	}
	return ret;
	}

	/**
	* @fn
	* Adapted from:
	* Traversing the Thread List [msdn]:
	* https://msdn.microsoft.com/en-us/library/windows/desktop/ms686852%28v=vs.85%29.aspx
	*/
	int
	EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist)
	{
	LARGE_INTEGER qpc;
	LARGE_INTEGER prevQpc;
	LARGE_INTEGER prevKtime;
	LARGE_INTEGER prevUtime;
	LARGE_INTEGER prevTime;
	BOOL Ok;
	int ret = 1;
	FILETIME Ctimef;
	FILETIME Etimef;
	FILETIME Ktimef;
	FILETIME Utimef;
	HANDLE hSnap = INVALID_HANDLE_VALUE;
	HANDLE hThread = INVALID_HANDLE_VALUE;
	DWORD Tid;
	/* I conclude that the '32' part of the toolhelp API names is an
	* anachronism because (1) there is no THREADENTRY64 and because (2) I have
	* used these APIs to access information for 64-bit processes. However,
	* since there is no API specific to 64-bit applications, I infer that the
	* bit-ness of this diagnostic process may need to match the bit-ness of
	* the target process that it is meant to diagnose, or otherwise that this
	* diagnostic application must be cross-compiled if it is to be launched
	* against applications of both architectures. This is already the case
	* to begin with due to details such as pointer lengths. */
	THREADENTRY32 te;
	ULONG_PTR ThreadEntryAddr;
	NTSTATUS Status;
	ModSpec modtmp;
	float Pcpu;
	float PcpuTot;

	/* Capture ticks at moment before creation of toolhelp snapshot. This will
	* be recorded as the first/last time the thread was seen. */
	Ok = QueryPerformanceCounter(&qpc);
	if (!Ok)
	{
	goto exit_EnumAndProfileThreads;
	}

	/* The API will ignore the @pid parameter, so results must be filtered by
	* the caller. @pid is included here to opportunistically limit the result
	* size in the event that the API changes, etc. See:
	* https://msdn.microsoft.com/en-us/library/windows/desktop/ms682489%28v=vs.85%29.aspx
	*
	* Specifically:
	* "th32ProcessID [in]... The process identifier of the process to be
	* included in the snapshot. This parameter can be zero to indicate the
	* current process. This parameter is used when the TH32CS_SNAPHEAPLIST,
	* TH32CS_SNAPMODULE, TH32CS_SNAPMODULE32, or TH32CS_SNAPALL value is
	* specified. Otherwise, it is ignored and all processes are included in
	* the snapshot."
	*/
	hSnap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, pid);
	if (hSnap == INVALID_HANDLE_VALUE)
	{
	goto exit_EnumAndProfileThreads;
	}

	te.dwSize = sizeof(te);
	te.cntUsage = 0; /* Unused and always zero per THREADENTRY32 [msdn] */

	Ok = Thread32First(hSnap, &te);
	if (!Ok)
	{
	goto exit_EnumAndProfileThreads;
	}

	PcpuTot = 0.0f;

	do
	{
	if (te.th32OwnerProcessID != pid)
	{
	continue;
	}

	Tid = te.th32ThreadID;

	hThread = OpenThread(
	THREAD_QUERY_INFORMATION \| THREAD_SUSPEND_RESUME \|
	THREAD_GET_CONTEXT,
	FALSE,
	Tid
	);
	if (hThread == INVALID_HANDLE_VALUE)
	{
	goto exit_EnumAndProfileThreads;
	}

	Status = gbls.NtQueryInformationThread(
	hThread,
	ThreadQuerySetWin32StartAddress,
	&ThreadEntryAddr,
	sizeof(ThreadEntryAddr),
	NULL
	);
	if (Status != STATUS_SUCCESS)
	{
	goto exit_EnumAndProfileThreads;
	}

	Ok = GetThreadTimes(hThread, &Ctimef, &Etimef, &Ktimef, &Utimef);
	if (!Ok)
	{
	goto exit_EnumAndProfileThreads;
	}

	ULONG_PTR ip = GetThreadIp(hThread);
	Ok = FindMod(modlist, ip, modtmp);
	if (Ok)
	{
	printf(
	"Tid = %d, Rip = 0x%p (%s+0x%p)\n",
	Tid,
	ip,
	modtmp.FinalComponent.c_str(),
	ip - modtmp.Start
	);
	}
	else
	{
	printf("Tid = %d, Rip = 0x%p\n", Tid, ip);
	}

	ThreadSpec ts;

	if (threadhash.find(Tid) == threadhash.end())
	{
	/* New thread specification */
	ts.Tid = te.th32ThreadID;
	ts.FirstSeen = qpc;
	ts.LastSeen = qpc;

	/* ts.Ktime <== LARGE_INTEGER(Ktimef) */
	ts.Ktime.LowPart = Ktimef.dwLowDateTime;
	ts.Ktime.HighPart = Ktimef.dwHighDateTime;

	/* ts.Utime <== LARGE_INTEGER(Utimef) */
	ts.Utime.LowPart = Utimef.dwLowDateTime;
	ts.Utime.HighPart = Utimef.dwHighDateTime;

	ts.Entry = ThreadEntryAddr;

	/* FIXFIX: Mixing and matching bool and BOOL, oh the humanity */
	Ok = FindMod(modlist, ThreadEntryAddr, modtmp);
	if (Ok)
	{
	ts.ModInfoValid = true;
	ts.ModName = modtmp.FinalComponent;
	ts.EntryModOff = ts.Entry - modtmp.Start;
	#if DEBUG
	printf("%s+0x%p\n", ts.ModName.c_str(), ts.EntryModOff);
	printf("\t%s starts at 0x%p\n", modtmp.FinalComponent.c_str(), modtmp.Start);
	#endif
	}
	else
	{
	ts.ModInfoValid = false;
	ts.ModName = "(unknown module)";
	ts.EntryModOff = ts.Entry;
	}

	/* NOTE: Pcpu is indeterminate the first time a thread is
	* encountered because there is no historical information for
	* kernel or user time spent. */

	/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
	* pending DebugActiveProcess etc. support */

	threadhash[Tid] = ts;
	}
	else
	{
	ThreadSpec& ts = threadhash[Tid];

	/* Collect old data */
	prevQpc = ts.LastSeen;
	prevKtime = ts.Ktime;
	prevUtime = ts.Utime;
	prevTime.QuadPart = ts.Ktime.QuadPart + ts.Utime.QuadPart;

	/* Entry, ModName, EntryModOff are static */

	Pcpu = CalcCpu(qpc, prevQpc, prevTime, Ktimef, Utimef);
	PcpuTot += Pcpu;
	ts.Pcpu.push_back(Pcpu);

	/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
	* pending DebugActiveProcess etc. support */

	/* Update */
	ts.LastSeen = qpc;
	ts.Ktime.LowPart = Ktimef.dwLowDateTime;
	ts.Ktime.HighPart = Ktimef.dwHighDateTime;
	ts.Utime.LowPart = Utimef.dwLowDateTime;
	ts.Utime.HighPart = Utimef.dwHighDateTime;
	}

	} while (Thread32Next(hSnap, &te));

	#if DEBUG
	printf("PcpuTot/%d = %G%%\n", DEBUG_CPUS, PcpuTot/DEBUG_CPUS);
	#endif

	ret = 0;

	exit_EnumAndProfileThreads:
	return ret;
	}

	ULONG_PTR
	GetThreadIp(HANDLE hThread)
	{
	ULONG_PTR ret = 0;
	DWORD Susp = (DWORD)-1;
	BOOL Ok;
	CONTEXT ctx;

	/* Because you cannot obtain a valid thread context from a running thread,
	* the first step is to increment the suspend count for the thread. Note
	* that the GetThreadContext() documentation explicitly states that you do
	* not need to be debugging a thread to use GetThreadContext(), although
	* that is a common usage. */
	Susp = SuspendThread(hThread);
	if (((DWORD)-1) == Susp)
	{
	goto exit_GetThreadIp;
	}

	ctx.ContextFlags = CONTEXT_CONTROL;
	Ok = GetThreadContext(hThread, &ctx);
	if (!Ok)
	{
	/* Includes logic for resuming the target thread if it was successfully
	* suspended */
	goto exit_GetThreadIp;
	}

	ret = (ULONG_PTR) ctx.Rip;

	exit_GetThreadIp:
	if (((DWORD)-1) != Susp)
	{
	ResumeThread(hThread);
	}

	return ret;
	}

	/**
	* @fn
	* This application might be more efficient if ModVect were a data structure
	* that sorted its elements upon insertion. */
	bool
	FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp)
	{
	bool ret = false;

	for (int i=0; i<modlist.size(); i++)
	{
	if ((entry >= modlist[i].Start) && (entry < modlist[i].End))
	{
	modtmp = modlist[i];
	ret = true;
	break;
	}
	}
	return ret;
	}

	float
	CalcCpu(
	LARGE_INTEGER& qpc,
	LARGE_INTEGER& prevQpc,
	LARGE_INTEGER& prevTime,
	FILETIME& Ktimef,
	FILETIME& Utimef
	)
	{
	LARGE_INTEGER Ktime;
	LARGE_INTEGER Utime;
	LARGE_INTEGER Time;
	float dtUsed;
	float dtElapsed;

	Ktime.LowPart = Ktimef.dwLowDateTime;
	Ktime.HighPart = Ktimef.dwHighDateTime;

	Utime.LowPart = Utimef.dwLowDateTime;
	Utime.HighPart = Utimef.dwHighDateTime;

	Time.QuadPart = Utime.QuadPart + Ktime.QuadPart;

	dtUsed = (float)(Time.QuadPart - prevTime.QuadPart) / FILETIME_PER_SEC;
	dtElapsed = (float)(qpc.QuadPart - prevQpc.QuadPart) / gbls.qpcFreq.QuadPart;

	return 100 * dtUsed / dtElapsed;
	}

	int
	DisplayInformation(ThreadByTid& threadhash)
	{
	ThreadVect threadvect;
	ThreadByTid::iterator itt;
	ThreadVect::iterator itv;

	for (itt=threadhash.begin(); itt != threadhash.end(); itt++)
	{
	threadvect.push_back(itt->second);
	}

	sort(threadvect.begin(), threadvect.end(), Compare);

	for (itv=threadvect.begin(); itv != threadvect.end(); itv++)
	{
	if (itv->ModInfoValid)
	{
	printf("%G%%, %d, %s+0x%p\n",
	Average(itv->Pcpu),
	itv->Tid,
	itv->ModName,
	itv->EntryModOff
	);
	}
	else
	{
	printf("%G%%, %d, 0x%p\n",
	Average(itv->Pcpu),
	itv->Tid,
	itv->Entry
	);
	}
	}

	return 0;
	}

	bool
	Compare(struct ThreadSpec& l, struct ThreadSpec& r)
	{
	float lPcpuAvg = 0.0f;
	float rPcpuAvg = 0.0f;

	lPcpuAvg = Average(l.Pcpu);
	rPcpuAvg = Average(r.Pcpu);

	return (lPcpuAvg < rPcpuAvg);
	}