Skip to content

Instantly share code, notes, and snippets.

@cleanbaja
Last active October 1, 2022 00:32
Show Gist options
  • Save cleanbaja/b9209a9b1c5fb3a9df243637d80670b6 to your computer and use it in GitHub Desktop.
Save cleanbaja/b9209a9b1c5fb3a9df243637d80670b6 to your computer and use it in GitHub Desktop.
XSAVE profiler & information dumper
/*++
Copyright (c) 2022 cleanbaja, All Rights Reserved.
Module Name:
xsave_test.cc
Abstract:
C++ module for profiling/dumping the XSAVE family of instructions.
Author:
Yusuf V. Hirsi (@cleanbaja)
Environment:
Windows or Unix-Compatible terminal
Notes:
A Context Switch might take place in-between the readings of the
internal cpu counter. To counter this, run this program multiple times
to smooth the readings out. Finally, CPU readings will always be
different across machines, due to the implementation-specific nature of the XSAVE
instruction.
PS: Readings might also vary for different reasons, such as CPU p-states
or reordering of the TSC reads by the compiler/CPU.
--*/
#include <bitset>
#include <chrono>
#include <iostream>
#include <type_traits>
#include <vector>
// Various C headers for things that the C++ stdlib doesn't have,
// or that C has a better implementation of.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
// We use the 'CPUID' instruction to gather information about the FPU.
// CPUID_XSAVE_FEATURES is the CPUID leaf number (0xD) that this file queries
// for the XSAVE feature bits and the save-area sizes.
#define CPUID_XSAVE_FEATURES 0xDU
// Runs CPUID for `leaf` with ECX (the sub-leaf selector) forced to zero.
//
// leaf - value loaded into EAX before the instruction executes
// eax, ebx, ecx, edx - receive the four result registers; all four pointers
//                      are written unconditionally
static inline void cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
                         uint32_t *ecx, uint32_t *edx) {
  // EAX and ECX are both input and output, hence the read-write operands.
  uint32_t reg_a = leaf;
  uint32_t reg_b = 0;
  uint32_t reg_c = 0x0;
  uint32_t reg_d = 0;

  asm volatile("cpuid"
               : "+a"(reg_a), "=b"(reg_b), "+c"(reg_c), "=d"(reg_d)
               :
               : "memory");

  *eax = reg_a;
  *ebx = reg_b;
  *ecx = reg_c;
  *edx = reg_d;
}
// Runs CPUID for `leaf` with an explicit sub-leaf selector.
//
// leaf    - value loaded into EAX before the instruction executes
// subleaf - value loaded into ECX before the instruction executes
// eax, ebx, ecx, edx - receive the four result registers; all four pointers
//                      are written unconditionally
static inline void cpuid_subleaf(uint32_t leaf, uint32_t subleaf, uint32_t *eax,
                                 uint32_t *ebx, uint32_t *ecx, uint32_t *edx) {
  // Register order: EAX, EBX, ECX, EDX.
  uint32_t regs[4] = {leaf, 0, subleaf, 0};

  asm volatile("cpuid"
               : "+a"(regs[0]), "=b"(regs[1]), "+c"(regs[2]), "=d"(regs[3])
               :
               : "memory");

  *eax = regs[0];
  *ebx = regs[1];
  *ecx = regs[2];
  *edx = regs[3];
}
// A class for determining the context size of the FPU.
//
// The constructor queries CPUID leaf CPUID_XSAVE_FEATURES and records:
//   sv           - the preferred save instruction (XSAVEC over XSAVEOPT over
//                  plain XSAVE, which is assumed to exist)
//   supported_sv - OR-ed SaveMode flags of the optional instructions found
//   ctx_size     - the size CPUID reported for the preferred instruction
// The findings are printed to std::cout by the first constructed instance only.
struct ContextInfo {
  // Power-of-two values so that several modes can be combined in supported_sv.
  enum class SaveMode : uint8_t {
    FPU_SAVE_XSAVE = 1,
    FPU_SAVE_XSAVEOPT = 2,
    FPU_SAVE_XSAVEC = 4
  };

  ContextInfo() {
    // Shared by all instances: cleared after the first report is printed.
    static bool log = true;

    // Sub-leaf 1: EAX bit 0 = XSAVEOPT, bit 1 = XSAVEC, bit 3 = XSAVES.
    uint32_t a = 0, b = 0, c = 0, d = 0;
    cpuid_subleaf(CPUID_XSAVE_FEATURES, 1, &a, &b, &c, &d);
    const std::bitset<32> supported_bits{a};

    // Later checks override earlier ones, so XSAVEC wins when both exist.
    if (supported_bits.test(0)) {
      sv = SaveMode::FPU_SAVE_XSAVEOPT;
      supported_sv |= static_cast<uint8_t>(SaveMode::FPU_SAVE_XSAVEOPT);
    }
    if (supported_bits.test(1)) {
      sv = SaveMode::FPU_SAVE_XSAVEC;
      supported_sv |= static_cast<uint8_t>(SaveMode::FPU_SAVE_XSAVEC);
    }
    if (supported_bits.test(3) && log) {
      std::cout << "xsave_test: XSAVES is available, ignoring since its a "
                   "supervisor instruction...\n";
    }

    // Finally, find the proper size to use (sub-leaf 0: ECX for
    // XSAVE/XSAVEOPT, EBX for XSAVEC).
    cpuid(CPUID_XSAVE_FEATURES, &a, &b, &c, &d);
    const char *mode_name = "XSAVE";
    switch (sv) {
    case SaveMode::FPU_SAVE_XSAVE:
      mode_name = "XSAVE";
      ctx_size = c;
      break;
    case SaveMode::FPU_SAVE_XSAVEOPT:
      mode_name = "XSAVEOPT";
      ctx_size = c;
      break;
    case SaveMode::FPU_SAVE_XSAVEC:
      mode_name = "XSAVEC";
      ctx_size = b;
      break;
    }

    if (log) {
      std::cout << "xsave_test: " << mode_name
                << " preferred with a context size of " << ctx_size << "\n";
      log = false;
    }
  }

protected:
  // Every member has an initializer: supported_sv in particular is only ever
  // OR-ed into, so it must start from zero rather than an indeterminate value.
  SaveMode sv = SaveMode::FPU_SAVE_XSAVE;
  uint8_t supported_sv = 0;
  size_t ctx_size = 0;
};
// Common interface of the save/restore strategies.
//
// The base implementation is a no-op that reports itself as neither active
// nor supported; concrete strategies override all four operations.
class FpuInterface {
protected:
  // Save area handed to the save/restore instructions by derived classes:
  // 1024 * 8 bytes = 8 KiB, zero-filled and aligned to 64 bytes.
  uint64_t __attribute__((aligned(64))) _context[1024] = {0};
  // Set by derived classes; false until one of them says otherwise.
  bool right_mode = false;

public:
  // Instances are deleted through FpuInterface pointers, which is only
  // well-defined when the destructor is virtual.
  virtual ~FpuInterface() = default;

  // Captures the current processor state into the save area.
  virtual void save() {}
  // Loads the processor state back from the save area.
  virtual void restore() {}
  // True when this strategy is the one selected for use.
  virtual bool active() { return false; }
  // True when the underlying instruction is available.
  virtual bool supported() { return false; }
};
struct XSave : private ContextInfo, public FpuInterface {
XSave() : ContextInfo() {
if (sv != SaveMode::FPU_SAVE_XSAVE) {
this->right_mode = false;
} else {
memset(this->_context, 0, 8192);
this->right_mode = true;
}
}
void save() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xsave64((void *)this->_context, ((long long)~0));
}
void restore() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xrstor64((void *)this->_context, ((long long)~0));
}
bool active() { return this->right_mode; }
bool supported() { return true; }
};
struct XSaveOpt : private ContextInfo, public FpuInterface {
XSaveOpt() : ContextInfo() {
if (sv != SaveMode::FPU_SAVE_XSAVEOPT) {
this->right_mode = false;
} else {
memset(this->_context, 0, 8192);
this->right_mode = true;
}
}
void save() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xsaveopt64((void *)this->_context, ((long long)~0));
}
void restore() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xrstor64((void *)this->_context, ((long long)~0));
}
bool active() { return this->right_mode; }
bool supported() {
return this->supported_sv & (uint8_t)SaveMode::FPU_SAVE_XSAVEOPT;
}
};
struct XSaveC : private ContextInfo, public FpuInterface {
XSaveC() : ContextInfo() {
if (sv != SaveMode::FPU_SAVE_XSAVEC) {
this->right_mode = false;
} else {
memset(this->_context, 0, 8192);
this->right_mode = true;
}
}
void save() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xsavec64((void *)this->_context, ((long long)~0));
}
void restore() {
if (!active()) {
return;
}
if (((uintptr_t)this->_context % 64) != 0) {
return;
}
_xrstor64((void *)this->_context, ((long long)~0));
}
bool active() { return this->right_mode; }
bool supported() {
return this->supported_sv & (uint8_t)SaveMode::FPU_SAVE_XSAVEC;
}
};
// Two independent candidate lists; main() fills both and keeps one entry of
// each in `ctx` and `ctz` respectively.
std::vector<FpuInterface *> possible_choices[2];
// Context that holds the saved register state during the AVX test.
FpuInterface *ctx = nullptr;
// Context that is never saved into, i.e. its save area stays zero-filled.
FpuInterface *ctz = nullptr;
// Number of iterations each timing loop averages over.
constexpr int perf_cycles{5};
// Round-trips a known pattern through YMM1 to check that save/restore work:
//   1. load the pattern into YMM1 and save the state through `ctx`
//   2. restore the untouched (zero-filled) `ctz` area and expect YMM1 == 0
//   3. restore `ctx` and expect the pattern to be back
// Failures trip an assert(); both globals must have been selected by main().
//
// NOTE(review): YMM1 has to survive between the separate asm statements and
// the calls in between; the compiler is not obliged to leave it alone, so the
// check is best-effort — confirm when changing optimisation settings.
static void perform_avx_test() {
  assert(ctx != nullptr && ctz != nullptr);

  constexpr uint64_t pattern = 0xC625FEA5162DF121;
  uint64_t __attribute__((aligned(64)))
  test_value[4] = {pattern, pattern, pattern, pattern};

  // The asm moves 32 bytes while the operand only names the first element, so
  // the "memory" clobber tells the compiler about the rest of the array; the
  // "xmm1" clobber declares that the vector register is overwritten.
  asm volatile("vmovups %0, %%ymm1" : : "m"(*test_value) : "xmm1", "memory");

  // SAVE
  ctx->save();
  memset(test_value, 0, sizeof(test_value));
  asm volatile("vmovups %%ymm1, %0" : "=m"(*test_value) : : "memory");
  assert(test_value[0] == pattern);

  // RESTORE ALL ZEROS
  ctz->restore();
  memset(test_value, 0, sizeof(test_value));
  asm volatile("vmovups %%ymm1, %0" : "=m"(*test_value) : : "memory");
  assert(test_value[0] == 0);

  // RESTORE
  ctx->restore();
  memset(test_value, 0, sizeof(test_value));
  asm volatile("vmovups %%ymm1, %0" : "=m"(*test_value) : : "memory");
  assert(test_value[0] == pattern);

  std::cout << "xsave_test: AVX test completed successfully!\n";
}
// Times `iface->save()` and `iface->restore()` with the time-stamp counter
// and prints the average of perf_cycles runs for each.
//
// iface_name - label used in the printed line
// iface      - interface to measure; every call goes through it
static void profile_interface(std::string iface_name, FpuInterface *iface) {
  // Accumulate the cycles spent in save() ...
  uint64_t save_total = 0;
  for (int remaining = perf_cycles; remaining > 0; --remaining) {
    const uint64_t start = __builtin_ia32_rdtsc();
    iface->save();
    save_total += __builtin_ia32_rdtsc() - start;
  }

  // ... and, in a separate pass, the cycles spent in restore().
  uint64_t restore_total = 0;
  for (int remaining = perf_cycles; remaining > 0; --remaining) {
    const uint64_t start = __builtin_ia32_rdtsc();
    iface->restore();
    restore_total += __builtin_ia32_rdtsc() - start;
  }

  // Integer averages, exactly as reported to the user.
  const uint64_t save_avg = save_total / perf_cycles;
  const uint64_t restore_avg = restore_total / perf_cycles;

  std::cout << " * " << iface_name << " save/restore took " << save_avg
            << " and " << restore_avg << " CPU cycles!\n";
}
static void perform_speed_test() {
FpuInterface *xsave = new XSave();
FpuInterface *xsaveopt = new XSaveOpt();
FpuInterface *xsavec = new XSaveC();
// Time the saves for all three methods
std::cout << "\nTimings:\n";
profile_interface(std::string{"XSAVE"}, xsave);
if (xsaveopt->supported())
profile_interface(std::string{"XSAVOPT"}, xsaveopt);
if (xsavec->supported())
profile_interface(std::string{"XSAVEC"}, xsavec);
}
// Prints which state components CPUID leaf CPUID_XSAVE_FEATURES advertises,
// followed by the save-area sizes it reports.
static void dump_information() {
  uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

  // Sub-leaf 0: EAX is the component bitmap, ECX the "Standard" size.
  cpuid(CPUID_XSAVE_FEATURES, &eax, &ebx, &ecx, &edx);

  // A feature is reported only when every bit of its mask is set.
  struct Feature {
    uint32_t mask;
    const char *line;
  };
  static const Feature features[] = {
      {1U << 0, " * CPU supports x87 instructions!\n"},
      {1U << 1, " * CPU supports 128-bit SSE instructions!\n"},
      {1U << 2, " * CPU supports 256-bit AVX instructions!\n"},
      {(1U << 5) | (1U << 6) | (1U << 7),
       " * CPU supports 512-bit AVX instructions!\n"},
      {(1U << 3) | (1U << 4), " * CPU supports Intel(R) MPX instructions!\n"},
      {1U << 9, " * CPU supports PKU (Protection Keys for Userspace)!\n"},
  };

  std::cout << "\nXCR0 Contexts supported:\n";
  for (const Feature &feature : features) {
    if ((eax & feature.mask) == feature.mask) {
      std::cout << feature.line;
    }
  }

  std::cout << "\nContext Sizes:\n";
  std::cout << " * Standard -> " << ecx << " bytes\n";

  // Sub-leaf 1, EAX bit 1 (XSAVEC): report EBX of sub-leaf 0.
  // NOTE(review): sub-leaf 0 EBX may be the standard-format size for the
  // currently enabled components rather than a compacted size — confirm the
  // label against the CPUID documentation.
  cpuid_subleaf(CPUID_XSAVE_FEATURES, 1, &eax, &ebx, &ecx, &edx);
  if ((eax & 2U) != 0) {
    cpuid(CPUID_XSAVE_FEATURES, &eax, &ebx, &ecx, &edx);
    std::cout << " * Compacted -> " << ebx << " bytes\n";
  }

  // Sub-leaf 1, EAX bit 3 (XSAVES): report EBX of that same sub-leaf.
  cpuid_subleaf(CPUID_XSAVE_FEATURES, 1, &eax, &ebx, &ecx, &edx);
  if ((eax & 8U) != 0) {
    std::cout << " * Compacted Supervisor -> " << ebx << " bytes\n";
  }
}
int main() {
// Make sure XSAVE is supported!
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
cpuid_subleaf(1, 0x0, &eax, &ebx, &ecx, &edx);
if (!(ecx & (1U << 27U))) {
std::cerr << "xsave_test: XSAVE instruction not supported or enabled, "
"aborting test!\n";
return 1;
}
// Gather all possible choices
for (int i = 0; i < 2; i++) {
possible_choices[i].push_back(new XSave());
possible_choices[i].push_back(new XSaveOpt());
possible_choices[i].push_back(new XSaveC());
}
// Then, find the best choice
for (auto ctx : possible_choices[0]) {
if (ctx->active()) {
::ctx = ctx;
} else {
delete ctx;
}
}
for (auto ctz : possible_choices[1]) {
if (ctz->active()) {
::ctz = ctz;
} else {
delete ctz;
}
}
// Finally, make sure AVX is supported, before actually performing the tests
if (!(ecx & (1U << 28U))) {
std::cerr << "xsave_test: AVX not supported, skipping test!\n";
} else {
perform_avx_test();
}
perform_speed_test();
dump_information();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment