Skip to content

Instantly share code, notes, and snippets.

@strictlymike
Created April 22, 2017 15:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save strictlymike/c7495013a6e505f02d0f8e260b13435c to your computer and use it in GitHub Desktop.
Save strictlymike/c7495013a6e505f02d0f8e260b13435c to your computer and use it in GitHub Desktop.
Automated performance monitoring application diagnostics
/**
* @file
* A rough sketch of providing diagnostic information about to the state of an
* application relevant to diagnosing prolonged high CPU. When prolonged high
* CPU occurs, the high-level question that must be answered is:
*
* 1. What is the application doing?
* and,
* 2. Why aren't CPU throttling controls working? (if applicable)
*
* Question (1) can be answered by investigating where the application is
* spending its time, which can be learned using readily available OS APIs.
*
* Unfortunately, question (2) is likely to require application developers to
* expose application-specific instrumentation to expose the state of various
* counters and criteria used to make a decision to do additional work. Because
* this facility is being developed in the abstract (without any particular
* target in mind) and over a short development period (1 day), it will not
* address question (2).
*
* To address question (1), this application will obtain several pieces of
* information based on live artifacts inspected with OS APIs over several
* iterations:
*
* 1. What threads are running (identified by entry point)?
* 2. What is the CPU utilization of each thread?
* 3. What instructions are being executed in each thread?
*
* The information in (1) is the basis of (2) and (3). The information in (2)
* will allow engineers to identify which threads are using the most resources.
* And the information in (3) may allow engineers to identify specific
* subroutines or basic blocks that are executing repeatedly.
*
* Because threads may be created and destroyed at any time, this application
* must continually obtain (1) and then dispatch (2) and (3) against each
* thread. The result may be that new threads appear and disappear in the
* output of this program.
*
* Ideally, output would be a binary or XML file that could be loaded into a
* user interface for browsing the data in a coherent way. In this sample
* program, information will be collected over a number iterations and
* displayed on the console before terminating.
*/
#include <windows.h>
#include <Psapi.h>
#include <TlHelp32.h>
#include <stdio.h>
#include <vector>
#include <map>
#include <algorithm>
#include <numeric>
#pragma comment(lib, "psapi.lib")
/* TODO: remove these and remove/update any code brackted by or using them */
#define DEBUG (1)
#define DEBUG_CPUS (4)
/* Tweak away as desired - can move these to getopt() later */
/* Number of iterations over which to watch threads */
#define SAMPLES (100)
/* Number of msec between samples. This will not be used precisely because it
* is not necessary. Instead, a Sleep() of this duration will be issued between
* samples. */
#define SAMPLE_PERIOD_MSEC (500)
/* Leave these alone */
#define PID_INVALID (-1)
#define ThreadQuerySetWin32StartAddress (9)
#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
#define STATUS_ACCESS_IS_DENIED ((NTSTATUS)0x00000005L)
#define FILETIME_PER_SEC (10000000)
typedef NTSTATUS (WINAPI *NTQUERYINFOMATIONTHREAD_T)(
HANDLE,
LONG,
PVOID,
ULONG,
PULONG
);
struct ModSpec
{
std::string Path;
std::string FinalComponent; /* FS jargon - indicates filename */
ULONG_PTR Size;
ULONG_PTR Start;
ULONG_PTR End; /* = Start + Size */
};
typedef std::vector<struct ModSpec> ModVect;
struct IpSpec
{
ULONG_PTR Off; /* Instruction pointer e.g. EIP or RIP */
UINT Times; /* Occurrences */
std::string ModName; /* If applicable */
std::string ModOff; /* = Off - ModBase */
};
typedef std::map<ULONG_PTR, struct IpSpec> IPHistogram;
struct ThreadSpec
{
DWORD Tid;
LARGE_INTEGER FirstSeen; /* Raw QPC (ticks) not divided by QPF */
LARGE_INTEGER LastSeen; /* Raw QPC (ticks) not divided by QPF */
LARGE_INTEGER Ktime; /* From GetThreadTimes() FILETIME */
LARGE_INTEGER Utime; /* From GetThreadTimes() FILETIME */
bool ModInfoValid;
ULONG_PTR Entry; /* Thread entry point */
std::string ModName; /* If applicable */
ULONG_PTR EntryModOff; /* = Entry - ModBase */
/* TODO: Why do I choose to use simply float here, but double precision in
* the trigger application? Pick one. */
std::vector<float> Pcpu; /* With respect to a single CPU */
IPHistogram IpHist; /* Instruction Pointers */
};
typedef std::map<DWORD, struct ThreadSpec> ThreadByTid;
typedef std::vector<struct ThreadSpec> ThreadVect;
struct InspectGlobals
{
LARGE_INTEGER qpcFreq;
NTQUERYINFOMATIONTHREAD_T NtQueryInformationThread;
};
int InspectByHandle(HANDLE hProc, int pid);
int Usage(char *progname, FILE *out, int ret);
bool InitGlobals(struct InspectGlobals *g);
int InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash);
bool PopulateModVect(HANDLE hProc, ModVect& modlist);
int EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist);
bool FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp);
float CalcCpu(
LARGE_INTEGER& qpc,
LARGE_INTEGER& prevQpc,
LARGE_INTEGER& prevTime,
FILETIME& Ktimef,
FILETIME& Utimef
);
int DisplayInformation(ThreadByTid& threadhash);
bool Compare(struct ThreadSpec& l, struct ThreadSpec& r);
ULONG_PTR GetThreadIp(HANDLE hThread);
template <class T>
T Average(std::vector<T>& v)
{
return std::accumulate(v.begin(), v.end(), 0) / v.size();
}
struct InspectGlobals gbls;
int
main(int argc, char **argv)
{
int ret = 1;
int pid = PID_INVALID;
HANDLE hProc;
ThreadByTid ThreadHash;
if (2 == argc)
{
pid = atoi(argv[1]);
}
/* If pid == PID_INVALID then pid was never assigned or argv[1] was a
* negative number. If pid == 0, then argv[1] was either 0 or not a valid
* numeric argument. */
if (pid <= 0)
{
ret = Usage(argv[0], stderr, 1);
goto exit_main;
}
if (!InitGlobals(&gbls))
{
goto exit_main;
}
/* Minimum requirement for EnumProcessModules() which is used later in this
* program. We do not open the handle with PROCESS_ALL_ACCESS in favor of
* applying the principle of least privilege. */
hProc = OpenProcess(
PROCESS_QUERY_INFORMATION | PROCESS_VM_READ,
FALSE,
pid
);
if (NULL == hProc)
{
fprintf(stderr, "OpenProcess failed, GLE=%d\n", GetLastError());
goto exit_main;
}
ret = InspectProcess(hProc, pid, ThreadHash);
if (0 != ret)
{
goto exit_main;
}
ret = DisplayInformation(ThreadHash);
exit_main:
/* Free all resources */
if (NULL != hProc)
{
CloseHandle(hProc);
}
return ret;
}
int
Usage(char *progname, FILE *out, int ret)
{
fprintf(out, "Usage: %s <pid>\n", progname);
fprintf(out, "\n");
fprintf(out, "Where pid is the pid of the process to inspect\n");
return ret;
}
bool
InitGlobals(struct InspectGlobals *g)
{
HMODULE hNtdll;
BOOL Ok;
bool ret = true;
hNtdll = LoadLibrary("ntdll.dll");
if (hNtdll == INVALID_HANDLE_VALUE)
{
goto exit_InitGlobals;
}
g->NtQueryInformationThread = (NTQUERYINFOMATIONTHREAD_T) GetProcAddress(
LoadLibrary("ntdll.dll"),
"NtQueryInformationThread"
);
if (g->NtQueryInformationThread == NULL)
{
goto exit_InitGlobals;
}
Ok = QueryPerformanceFrequency(&g->qpcFreq);
if (!Ok)
{
goto exit_InitGlobals;
}
ret = true;
exit_InitGlobals:
return ret;
}
int
InspectProcess(HANDLE hProc, int pid, ThreadByTid& threadhash)
{
int ret = 1;
ModVect modlist;
printf("Inspecting pid %d\n", pid);
for (int n=0; n<SAMPLES; n++)
{
printf("Sample %d\n", n);
modlist.clear();
if (!PopulateModVect(hProc, modlist))
{
fprintf(stderr, "Failed to get process modules\n");
goto exit_InspectProcess;
}
EnumAndProfileThreads(pid, threadhash, modlist);
Sleep(SAMPLE_PERIOD_MSEC);
}
ret = 0;
exit_InspectProcess:
return ret;
}
bool
PopulateModVect(HANDLE hProc, ModVect& modlist)
{
BOOL Ok;
bool ret = false;
HMODULE *Mods = NULL;
DWORD ModsSize = 0;
DWORD ModsSizeNeeded = 0;
int ModCount;
TCHAR szModName[MAX_PATH];
/* However unlikely it may be, if I change the size of szModName,
* ModNameAvail (below) ensures that the buffer length I pass to
* GetModuleFileNameEx varies accordingly. */
const DWORD ModNameAvail = sizeof(szModName) / sizeof(TCHAR);
MODULEINFO ModInfo;
/* EnumProcessModules loop: First iteration will allocate nothing and
* discover how much space is needed; subsequent iterations will allocate
* @Needed and check if that was sufficient.
*/
do
{
if (NULL != Mods)
{
delete [] Mods;
}
if (ModsSizeNeeded > 0)
{
delete [] Mods;
Mods = new HMODULE[ModsSizeNeeded];
if (NULL == Mods)
{
goto exit_PopulateModVect;
}
ModsSize = ModsSizeNeeded;
}
Ok = EnumProcessModules(hProc, Mods, ModsSize, &ModsSizeNeeded);
if (!Ok)
{
goto exit_PopulateModVect;
}
}
while (ModsSizeNeeded > ModsSize);
ModCount = ModsSize / sizeof(MODULEINFO);
for (int i=0; i<ModCount; i++)
{
ModSpec ms;
Ok = GetModuleFileNameEx(hProc, Mods[i], szModName, ModNameAvail);
if (!Ok)
{
/* TODO: Better logic may be to add a module with no name (base,
* size, and end only) and continue in the loop. Because such a
* condition seems unusual, the necessity of implementing such
* logic would depend on the binary that is targeted for
* instrumentation. */
goto exit_PopulateModVect;
}
Ok = GetModuleInformation(hProc, Mods[i], &ModInfo, sizeof(ModInfo));
if (!Ok)
{
goto exit_PopulateModVect;
}
/* I suspect string is declared with _Elem = char even if _UNICODE and
* UNICODE are defined, therefore this might actually require the use
* of basic_string specifying _Elem = TCHAR in order to truly work for
* Unicode. This poc will assume the target application has loaded
* modules only having ANSI names and that UNICODE is not used. */
ms.Path = szModName;
size_t idx = ms.Path.rfind('\\');
/* Although a module name is very unlikely to have a trailing slash,
* that case is easily included as a protection against "shenanigans".
*/
if ((idx == std::string::npos) && (idx >= ms.Path.length()-1))
{
ms.FinalComponent = ms.Path;
}
else
{
ms.FinalComponent = ms.Path.substr(idx+1);
}
ms.Size = ModInfo.SizeOfImage;
ms.Start = (ULONG_PTR) ModInfo.lpBaseOfDll;
ms.End = (ULONG_PTR) ModInfo.lpBaseOfDll + ModInfo.SizeOfImage;
modlist.push_back(ms);
}
ret = true;
exit_PopulateModVect:
if (NULL != Mods)
{
delete [] Mods;
}
return ret;
}
/**
* @fn
* Adapted from:
* Traversing the Thread List [msdn]:
* https://msdn.microsoft.com/en-us/library/windows/desktop/ms686852%28v=vs.85%29.aspx
*/
int
EnumAndProfileThreads(DWORD pid, ThreadByTid& threadhash, ModVect& modlist)
{
LARGE_INTEGER qpc;
LARGE_INTEGER prevQpc;
LARGE_INTEGER prevKtime;
LARGE_INTEGER prevUtime;
LARGE_INTEGER prevTime;
BOOL Ok;
int ret = 1;
FILETIME Ctimef;
FILETIME Etimef;
FILETIME Ktimef;
FILETIME Utimef;
HANDLE hSnap = INVALID_HANDLE_VALUE;
HANDLE hThread = INVALID_HANDLE_VALUE;
DWORD Tid;
/* I conclude that the '32' part of the toolhelp API names is an
* anachronism because (1) there is no THREADENTRY64 and because (2) I have
* used these APIs to access information for 64-bit processes. However,
* since there is no API specific to 64-bit applications, I infer that the
* bit-ness of this diagnostic process may need to match the bit-ness of
* the target process that it is meant to diagnose, or otherwise that this
* diagnostic application must be cross-compiled if it is to be launched
* against applications of both architectures. This is already the case
* to begin with due to details such as pointer lengths. */
THREADENTRY32 te;
ULONG_PTR ThreadEntryAddr;
NTSTATUS Status;
ModSpec modtmp;
float Pcpu;
float PcpuTot;
/* Capture ticks at moment before creation of toolhelp snapshot. This will
* be recorded as the first/last time the thread was seen. */
Ok = QueryPerformanceCounter(&qpc);
if (!Ok)
{
goto exit_EnumAndProfileThreads;
}
/* The API will ignore the @pid parameter, so results must be filtered by
* the caller. @pid is included here to opportunistically limit the result
* size in the event that the API changes, etc. See:
* https://msdn.microsoft.com/en-us/library/windows/desktop/ms682489%28v=vs.85%29.aspx
*
* Specifically:
* "th32ProcessID [in]... The process identifier of the process to be
* included in the snapshot. This parameter can be zero to indicate the
* current process. This parameter is used when the TH32CS_SNAPHEAPLIST,
* TH32CS_SNAPMODULE, TH32CS_SNAPMODULE32, or TH32CS_SNAPALL value is
* specified. Otherwise, it is ignored and all processes are included in
* the snapshot."
*/
hSnap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, pid);
if (hSnap == INVALID_HANDLE_VALUE)
{
goto exit_EnumAndProfileThreads;
}
te.dwSize = sizeof(te);
te.cntUsage = 0; /* Unused and always zero per THREADENTRY32 [msdn] */
Ok = Thread32First(hSnap, &te);
if (!Ok)
{
goto exit_EnumAndProfileThreads;
}
PcpuTot = 0.0f;
do
{
if (te.th32OwnerProcessID != pid)
{
continue;
}
Tid = te.th32ThreadID;
hThread = OpenThread(
THREAD_QUERY_INFORMATION | THREAD_SUSPEND_RESUME |
THREAD_GET_CONTEXT,
FALSE,
Tid
);
if (hThread == INVALID_HANDLE_VALUE)
{
goto exit_EnumAndProfileThreads;
}
Status = gbls.NtQueryInformationThread(
hThread,
ThreadQuerySetWin32StartAddress,
&ThreadEntryAddr,
sizeof(ThreadEntryAddr),
NULL
);
if (Status != STATUS_SUCCESS)
{
goto exit_EnumAndProfileThreads;
}
Ok = GetThreadTimes(hThread, &Ctimef, &Etimef, &Ktimef, &Utimef);
if (!Ok)
{
goto exit_EnumAndProfileThreads;
}
ULONG_PTR ip = GetThreadIp(hThread);
Ok = FindMod(modlist, ip, modtmp);
if (Ok)
{
printf(
"Tid = %d, Rip = 0x%p (%s+0x%p)\n",
Tid,
ip,
modtmp.FinalComponent.c_str(),
ip - modtmp.Start
);
}
else
{
printf("Tid = %d, Rip = 0x%p\n", Tid, ip);
}
ThreadSpec ts;
if (threadhash.find(Tid) == threadhash.end())
{
/* New thread specification */
ts.Tid = te.th32ThreadID;
ts.FirstSeen = qpc;
ts.LastSeen = qpc;
/* ts.Ktime <== LARGE_INTEGER(Ktimef) */
ts.Ktime.LowPart = Ktimef.dwLowDateTime;
ts.Ktime.HighPart = Ktimef.dwHighDateTime;
/* ts.Utime <== LARGE_INTEGER(Utimef) */
ts.Utime.LowPart = Utimef.dwLowDateTime;
ts.Utime.HighPart = Utimef.dwHighDateTime;
ts.Entry = ThreadEntryAddr;
/* FIXFIX: Mixing and matching bool and BOOL, oh the humanity */
Ok = FindMod(modlist, ThreadEntryAddr, modtmp);
if (Ok)
{
ts.ModInfoValid = true;
ts.ModName = modtmp.FinalComponent;
ts.EntryModOff = ts.Entry - modtmp.Start;
#if DEBUG
printf("%s+0x%p\n", ts.ModName.c_str(), ts.EntryModOff);
printf("\t%s starts at 0x%p\n", modtmp.FinalComponent.c_str(), modtmp.Start);
#endif
}
else
{
ts.ModInfoValid = false;
ts.ModName = "(unknown module)";
ts.EntryModOff = ts.Entry;
}
/* NOTE: Pcpu is indeterminate the first time a thread is
* encountered because there is no historical information for
* kernel or user time spent. */
/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
* pending DebugActiveProcess etc. support */
threadhash[Tid] = ts;
}
else
{
ThreadSpec& ts = threadhash[Tid];
/* Collect old data */
prevQpc = ts.LastSeen;
prevKtime = ts.Ktime;
prevUtime = ts.Utime;
prevTime.QuadPart = ts.Ktime.QuadPart + ts.Utime.QuadPart;
/* Entry, ModName, EntryModOff are static */
Pcpu = CalcCpu(qpc, prevQpc, prevTime, Ktimef, Utimef);
PcpuTot += Pcpu;
ts.Pcpu.push_back(Pcpu);
/* TODO: IpHist.push_back(instr_ptr) is not implemented, and is
* pending DebugActiveProcess etc. support */
/* Update */
ts.LastSeen = qpc;
ts.Ktime.LowPart = Ktimef.dwLowDateTime;
ts.Ktime.HighPart = Ktimef.dwHighDateTime;
ts.Utime.LowPart = Utimef.dwLowDateTime;
ts.Utime.HighPart = Utimef.dwHighDateTime;
}
} while (Thread32Next(hSnap, &te));
#if DEBUG
printf("PcpuTot/%d = %G%%\n", DEBUG_CPUS, PcpuTot/DEBUG_CPUS);
#endif
ret = 0;
exit_EnumAndProfileThreads:
return ret;
}
ULONG_PTR
GetThreadIp(HANDLE hThread)
{
ULONG_PTR ret = 0;
DWORD Susp = (DWORD)-1;
BOOL Ok;
CONTEXT ctx;
/* Because you cannot obtain a valid thread context from a running thread,
* the first step is to increment the suspend count for the thread. Note
* that the GetThreadContext() documentation explicitly states that you do
* not need to be debugging a thread to use GetThreadContext(), although
* that is a common usage. */
Susp = SuspendThread(hThread);
if (((DWORD)-1) == Susp)
{
goto exit_GetThreadIp;
}
ctx.ContextFlags = CONTEXT_CONTROL;
Ok = GetThreadContext(hThread, &ctx);
if (!Ok)
{
/* Includes logic for resuming the target thread if it was successfully
* suspended */
goto exit_GetThreadIp;
}
ret = (ULONG_PTR) ctx.Rip;
exit_GetThreadIp:
if (((DWORD)-1) != Susp)
{
ResumeThread(hThread);
}
return ret;
}
/**
* @fn
* This application might be more efficient if ModVect were a data structure
* that sorted its elements upon insertion. */
bool
FindMod(ModVect& modlist, ULONG_PTR entry, struct ModSpec& modtmp)
{
bool ret = false;
for (int i=0; i<modlist.size(); i++)
{
if ((entry >= modlist[i].Start) && (entry < modlist[i].End))
{
modtmp = modlist[i];
ret = true;
break;
}
}
return ret;
}
float
CalcCpu(
LARGE_INTEGER& qpc,
LARGE_INTEGER& prevQpc,
LARGE_INTEGER& prevTime,
FILETIME& Ktimef,
FILETIME& Utimef
)
{
LARGE_INTEGER Ktime;
LARGE_INTEGER Utime;
LARGE_INTEGER Time;
float dtUsed;
float dtElapsed;
Ktime.LowPart = Ktimef.dwLowDateTime;
Ktime.HighPart = Ktimef.dwHighDateTime;
Utime.LowPart = Utimef.dwLowDateTime;
Utime.HighPart = Utimef.dwHighDateTime;
Time.QuadPart = Utime.QuadPart + Ktime.QuadPart;
dtUsed = (float)(Time.QuadPart - prevTime.QuadPart) / FILETIME_PER_SEC;
dtElapsed = (float)(qpc.QuadPart - prevQpc.QuadPart) / gbls.qpcFreq.QuadPart;
return 100 * dtUsed / dtElapsed;
}
int
DisplayInformation(ThreadByTid& threadhash)
{
ThreadVect threadvect;
ThreadByTid::iterator itt;
ThreadVect::iterator itv;
for (itt=threadhash.begin(); itt != threadhash.end(); itt++)
{
threadvect.push_back(itt->second);
}
sort(threadvect.begin(), threadvect.end(), Compare);
for (itv=threadvect.begin(); itv != threadvect.end(); itv++)
{
if (itv->ModInfoValid)
{
printf("%G%%, %d, %s+0x%p\n",
Average(itv->Pcpu),
itv->Tid,
itv->ModName,
itv->EntryModOff
);
}
else
{
printf("%G%%, %d, 0x%p\n",
Average(itv->Pcpu),
itv->Tid,
itv->Entry
);
}
}
return 0;
}
bool
Compare(struct ThreadSpec& l, struct ThreadSpec& r)
{
float lPcpuAvg = 0.0f;
float rPcpuAvg = 0.0f;
lPcpuAvg = Average(l.Pcpu);
rPcpuAvg = Average(r.Pcpu);
return (lPcpuAvg < rPcpuAvg);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment