Skip to content

Instantly share code, notes, and snippets.

@cdwfs
Last active December 12, 2021 06:01
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cdwfs/4222ca09cb259f8dd50f7f2cf7d09179 to your computer and use it in GitHub Desktop.
Save cdwfs/4222ca09cb259f8dd50f7f2cf7d09179 to your computer and use it in GitHub Desktop.
Vulkan function to get a pair of timestamps (one CPU, one GPU) corresponding to (very nearly) the same point in absolute wall time.
/// Describes the device/queue pair that GetCpuGpuTimestamp() samples.
/// Field order is part of the interface (callers may aggregate-initialize).
struct CpuGpuTimestampInfo {
  /// Logical device used to create every temporary object (events, pools, fence).
  VkDevice device;

  /// Queue the timestamp-writing command buffer is submitted to.
  VkQueue queue;

  /// Family index of `queue`; used to create the transient command pool.
  uint32_t queue_family_index;

  /// Copy from VkPhysicalDeviceLimits::timestampPeriod.
  /// GetCpuGpuTimestamp() treats it as nanoseconds per device tick.
  float timestamp_period;

  /// Copy from VkQueueFamilyProperties::timestampValidBits.
  /// A value of 0 means the queue family cannot write timestamps.
  uint32_t timestamp_valid_bits;
};
/// Captures one CPU timestamp and one GPU timestamp that refer to (very nearly)
/// the same instant in wall time.
///
/// A one-shot command buffer is submitted that: waits on a host-signaled event,
/// writes a GPU timestamp, waits on a second host-signaled event, and writes a
/// second GPU timestamp. The host reads its own clock right after signaling each
/// event. The first host/device pair is the correlated sample returned to the
/// caller; the second pair is only used to print how closely the two clocks
/// agree over the interval between the events.
///
/// NOTE: this call blocks for roughly two seconds (two one-second sleeps) and
/// prints the measured host/device intervals to stdout.
///
/// @param info          Device, queue and timestamp properties to use.
/// @param out_cpu_time  Receives the host clock reading taken right after the
///                      first event was signaled. Written only on success.
/// @param out_gpu_time  Receives the raw device tick count of the first GPU
///                      timestamp, masked to `timestamp_valid_bits`. Written
///                      only on success.
/// @return VK_SUCCESS on success; VK_ERROR_FEATURE_NOT_PRESENT when
///         `info->timestamp_valid_bits` is 0; otherwise the result code of the
///         first Vulkan call that failed. All temporary Vulkan objects are
///         released on every return path.
VkResult GetCpuGpuTimestamp(const CpuGpuTimestampInfo *info,
    std::chrono::high_resolution_clock::time_point *out_cpu_time, uint64_t *out_gpu_time) {
  if (info->timestamp_valid_bits == 0) {
    return VK_ERROR_FEATURE_NOT_PRESENT;  // timestamps not supported on the specified queue
  }

  // Owns every temporary Vulkan object so that each early return below cleans up.
  struct ScopedObjects {
    VkDevice device;
    VkEvent event0;
    VkEvent event1;
    VkQueryPool qpool;
    VkCommandPool cpool;
    VkFence fence;
    bool gpu_work_pending;  // true between a successful submit and a successful fence wait

    explicit ScopedObjects(VkDevice d)
        : device(d),
          event0(VK_NULL_HANDLE),
          event1(VK_NULL_HANDLE),
          qpool(VK_NULL_HANDLE),
          cpool(VK_NULL_HANDLE),
          fence(VK_NULL_HANDLE),
          gpu_work_pending(false) {}

    ~ScopedObjects() {
      if (gpu_work_pending) {
        // We are bailing out while the command buffer may still be blocked on the
        // events. Release it (best effort) and give it a bounded time to finish.
        const uint64_t cleanup_timeout_ns = 5000000000ULL;
        (void)vkSetEvent(device, event0);
        (void)vkSetEvent(device, event1);
        if (vkWaitForFences(device, 1, &fence, VK_TRUE, cleanup_timeout_ns) == VK_TIMEOUT) {
          // The objects are still in use by the GPU; destroying them would be
          // invalid, so deliberately leak them instead.
          return;
        }
      }
      // vkDestroy* accepts VK_NULL_HANDLE, so partially-initialized state is fine.
      // Destroying the command pool also frees the command buffer allocated from it.
      vkDestroyCommandPool(device, cpool, nullptr);
      vkDestroyFence(device, fence, nullptr);
      vkDestroyQueryPool(device, qpool, nullptr);
      vkDestroyEvent(device, event1, nullptr);
      vkDestroyEvent(device, event0, nullptr);
    }
  } objs(info->device);

  // --- Create the temporary objects -------------------------------------------------------
  VkEventCreateInfo event_ci = {VK_STRUCTURE_TYPE_EVENT_CREATE_INFO};
  VkResult result = vkCreateEvent(objs.device, &event_ci, nullptr, &objs.event0);
  if (result != VK_SUCCESS) {
    return result;
  }
  result = vkCreateEvent(objs.device, &event_ci, nullptr, &objs.event1);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkQueryPoolCreateInfo qpool_ci = {VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO};
  qpool_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
  qpool_ci.queryCount = 2;
  result = vkCreateQueryPool(objs.device, &qpool_ci, nullptr, &objs.qpool);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkCommandPoolCreateInfo cpool_ci = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO};
  cpool_ci.queueFamilyIndex = info->queue_family_index;
  cpool_ci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
  result = vkCreateCommandPool(objs.device, &cpool_ci, nullptr, &objs.cpool);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkFenceCreateInfo fence_ci = {VK_STRUCTURE_TYPE_FENCE_CREATE_INFO};
  result = vkCreateFence(objs.device, &fence_ci, nullptr, &objs.fence);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkCommandBufferAllocateInfo cb_alloc_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO};
  cb_alloc_info.commandPool = objs.cpool;
  cb_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
  cb_alloc_info.commandBufferCount = 1;
  VkCommandBuffer cb = VK_NULL_HANDLE;
  result = vkAllocateCommandBuffers(objs.device, &cb_alloc_info, &cb);
  if (result != VK_SUCCESS) {
    return result;
  }

  // --- Record: reset queries, wait(event0), timestamp 0, wait(event1), timestamp 1 --------
  VkCommandBufferBeginInfo cb_begin_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
  cb_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
  result = vkBeginCommandBuffer(cb, &cb_begin_info);
  if (result != VK_SUCCESS) {
    return result;
  }
  // Queries are in an undefined state after pool creation and must be reset before
  // they are written.
  // NOTE(review): vkCmdResetQueryPool needs a graphics- or compute-capable queue family;
  // confirm that holds for every queue this helper is used with.
  vkCmdResetQueryPool(cb, objs.qpool, 0, 2);
  vkCmdWaitEvents(cb, 1, &objs.event0, VK_PIPELINE_STAGE_HOST_BIT,
      VK_PIPELINE_STAGE_HOST_BIT | VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, nullptr, 0, nullptr, 0, nullptr);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, objs.qpool, 0);
  vkCmdWaitEvents(cb, 1, &objs.event1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr,
      0, nullptr, 0, nullptr);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, objs.qpool, 1);
  result = vkEndCommandBuffer(cb);
  if (result != VK_SUCCESS) {
    return result;
  }

  // --- Submit and drive the two events from the host ---------------------------------------
  VkSubmitInfo submit_info = {VK_STRUCTURE_TYPE_SUBMIT_INFO};
  submit_info.commandBufferCount = 1;
  submit_info.pCommandBuffers = &cb;
  result = vkQueueSubmit(info->queue, 1, &submit_info, objs.fence);
  if (result != VK_SUCCESS) {
    return result;
  }
  objs.gpu_work_pending = true;

  // Wait until we're reasonably sure the GPU is sitting at the
  // first WaitEvents() call before we signal the event.
  std::this_thread::sleep_for(std::chrono::seconds(1));
  std::chrono::high_resolution_clock::time_point host_times[2];
  result = vkSetEvent(objs.device, objs.event0);
  if (result != VK_SUCCESS) {
    return result;
  }
  host_times[0] = std::chrono::high_resolution_clock::now();

  std::this_thread::sleep_for(std::chrono::seconds(1));
  result = vkSetEvent(objs.device, objs.event1);
  if (result != VK_SUCCESS) {
    return result;
  }
  host_times[1] = std::chrono::high_resolution_clock::now();

  result = vkWaitForFences(objs.device, 1, &objs.fence, VK_TRUE, UINT64_MAX);
  if (result != VK_SUCCESS) {
    return result;
  }
  objs.gpu_work_pending = false;

  // --- Read back the device timestamps ------------------------------------------------------
  uint64_t raw_device_timestamps[2] = {0, 0};
  result = vkGetQueryPoolResults(objs.device, objs.qpool, 0, 2, sizeof(raw_device_timestamps),
      raw_device_timestamps, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
  if (result != VK_SUCCESS) {
    return result;
  }

  // Only the low timestamp_valid_bits bits carry meaning; mask before anything is published.
  const uint64_t timestamp_mask =
      (info->timestamp_valid_bits >= 64) ? UINT64_MAX : ((1ULL << info->timestamp_valid_bits) - 1);
  for (uint32_t i = 0; i < 2; ++i) {
    raw_device_timestamps[i] &= timestamp_mask;
  }

  // raw_device_timestamps[0] and host_times[0] now correspond to the same time point in two different time domains.
  *out_cpu_time = host_times[0];
  *out_gpu_time = raw_device_timestamps[0];

  // Everything beyond here is just approximating the error between the two timestamps, as we know that
  // delta(host_times[1], host_times[0]) should theoretically be identical to
  // delta(raw_device_timestamps[1], raw_device_timestamps[0]).
  // The tick delta is taken in integer space (modulo the counter width) so that large absolute
  // tick values do not lose precision in the double conversion and a counter wrap is handled.
  const uint64_t elapsed_ticks = (raw_device_timestamps[1] - raw_device_timestamps[0]) & timestamp_mask;
  const double seconds_per_tick = static_cast<double>(info->timestamp_period) / 1e9;
  const double device_elapsed_secs = static_cast<double>(elapsed_ticks) * seconds_per_tick;
  const auto host_secs =
      static_cast<double>(
          std::chrono::duration_cast<std::chrono::nanoseconds>(host_times[1] - host_times[0]).count()) /
      1e9;
  printf("%.9f seconds elapsed on host\n", host_secs);
  printf("%.9f seconds elapsed on device\n", device_elapsed_secs);
  return VK_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment