cdwfs/vk_cpu_gpu_timestamp.cpp

## vk_cpu_gpu_timestamp.cpp
// Inputs consumed by GetCpuGpuTimestamp(). All handles are borrowed: the function
// only reads them and creates/destroys its own temporary objects on `device`.
struct CpuGpuTimestampInfo {
  VkDevice device;  // Device on which the temporary events, pools and fence are created.
  VkQueue queue;  // Queue the timestamp-writing command buffer is submitted to.
  uint32_t queue_family_index;  // Family used for the transient command pool (presumably the family of `queue`).
  float timestamp_period;  // Copy from VkPhysicalDeviceLimits::timestampPeriod
  uint32_t timestamp_valid_bits;  // Copy from VkQueueFamilyProperties::timestampValidBits
};
// Captures one host timestamp and one device timestamp that refer to (approximately) the
// same instant, so that the two time domains can be correlated.
//
// How it works: a command buffer is submitted that blocks on a host-signaled event, writes
// timestamp 0, blocks on a second host-signaled event, and writes timestamp 1. The host
// signals each event and samples std::chrono::high_resolution_clock immediately afterwards.
// The first (host, device) pair is returned; the second pair is only used to print how far
// the elapsed host time and elapsed device time disagree.
//
// Parameters:
//   info          - device/queue handles and timestamp properties (see CpuGpuTimestampInfo).
//   out_cpu_time  - receives the host time sampled right after the first event is signaled.
//   out_gpu_time  - receives the raw device timestamp (in ticks) written after that event.
//
// Returns VK_SUCCESS on success, VK_ERROR_FEATURE_NOT_PRESENT if timestamp_valid_bits is 0,
// or the first failing VkResult from the Vulkan calls made here. The outputs are written only
// on success. All temporary Vulkan objects are released on every exit path.
//
// Note: this call sleeps for roughly two seconds and prints two diagnostic lines to stdout.
VkResult GetCpuGpuTimestamp(const CpuGpuTimestampInfo *info,
    std::chrono::high_resolution_clock::time_point *out_cpu_time, uint64_t *out_gpu_time) {
  if (info->timestamp_valid_bits == 0) {
    return VK_ERROR_FEATURE_NOT_PRESENT;  // timestamps not supported on the specified queue
  }

  // Every temporary object starts out null so the cleanup below can run at any point;
  // the vkDestroy* functions ignore VK_NULL_HANDLE.
  VkEvent event0 = VK_NULL_HANDLE;
  VkEvent event1 = VK_NULL_HANDLE;
  VkQueryPool qpool = VK_NULL_HANDLE;
  VkCommandPool cpool = VK_NULL_HANDLE;
  VkFence fence = VK_NULL_HANDLE;
  bool gpu_work_pending = false;  // true between a successful submit and a successful fence wait

  // Releases all temporary objects and passes `status` through, so each exit is a single
  // `return release_all(...)`.
  const auto release_all = [&](VkResult status) -> VkResult {
    if (gpu_work_pending) {
      // The submitted command buffer may still be parked in vkCmdWaitEvents. Signal both
      // events (best effort) and drain the queue before destroying objects it references.
      (void)vkSetEvent(info->device, event0);
      (void)vkSetEvent(info->device, event1);
      (void)vkWaitForFences(info->device, 1, &fence, VK_TRUE, UINT64_MAX);
    }
    vkDestroyCommandPool(info->device, cpool, NULL);  // also frees the command buffer
    vkDestroyFence(info->device, fence, NULL);
    vkDestroyQueryPool(info->device, qpool, NULL);
    vkDestroyEvent(info->device, event1, NULL);
    vkDestroyEvent(info->device, event0, NULL);
    return status;
  };

  VkEventCreateInfo event_ci = {VK_STRUCTURE_TYPE_EVENT_CREATE_INFO};
  VkResult result = vkCreateEvent(info->device, &event_ci, NULL, &event0);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  result = vkCreateEvent(info->device, &event_ci, NULL, &event1);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkQueryPoolCreateInfo qpool_ci = {VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO};
  qpool_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
  qpool_ci.queryCount = 2;
  result = vkCreateQueryPool(info->device, &qpool_ci, NULL, &qpool);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkCommandPoolCreateInfo cpool_ci = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO};
  cpool_ci.queueFamilyIndex = info->queue_family_index;
  cpool_ci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
  result = vkCreateCommandPool(info->device, &cpool_ci, NULL, &cpool);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkFenceCreateInfo fence_ci = {VK_STRUCTURE_TYPE_FENCE_CREATE_INFO};
  result = vkCreateFence(info->device, &fence_ci, NULL, &fence);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkCommandBufferAllocateInfo cb_alloc_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO};
  cb_alloc_info.commandPool = cpool;
  cb_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
  cb_alloc_info.commandBufferCount = 1;
  VkCommandBuffer cb = VK_NULL_HANDLE;
  result = vkAllocateCommandBuffers(info->device, &cb_alloc_info, &cb);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkCommandBufferBeginInfo cb_begin_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
  cb_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
  result = vkBeginCommandBuffer(cb, &cb_begin_info);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  // Queries must be reset before they are written; a freshly created pool is not implicitly reset.
  vkCmdResetQueryPool(cb, qpool, 0, 2);
  vkCmdWaitEvents(cb, 1, &event0, VK_PIPELINE_STAGE_HOST_BIT,
      VK_PIPELINE_STAGE_HOST_BIT | VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, NULL, 0, NULL, 0, NULL);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, qpool, 0);
  vkCmdWaitEvents(
      cb, 1, &event1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, NULL, 0, NULL, 0, NULL);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, qpool, 1);
  result = vkEndCommandBuffer(cb);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  VkSubmitInfo submit_info = {VK_STRUCTURE_TYPE_SUBMIT_INFO};
  submit_info.commandBufferCount = 1;
  submit_info.pCommandBuffers = &cb;
  result = vkQueueSubmit(info->queue, 1, &submit_info, fence);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  gpu_work_pending = true;

  // Wait until we're reasonably sure the GPU is sitting at the
  // first WaitEvents() call before we signal the event.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  std::chrono::high_resolution_clock::time_point host_times[2];
  result = vkSetEvent(info->device, event0);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  host_times[0] = std::chrono::high_resolution_clock::now();
  // Let a known amount of time pass so the host and device deltas can be compared below.
  std::this_thread::sleep_for(std::chrono::seconds(1));
  result = vkSetEvent(info->device, event1);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  host_times[1] = std::chrono::high_resolution_clock::now();

  result = vkWaitForFences(info->device, 1, &fence, VK_TRUE, UINT64_MAX);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }
  gpu_work_pending = false;

  uint64_t raw_device_timestamps[2] = {0, 0};
  result = vkGetQueryPoolResults(info->device, qpool, 0, 2, sizeof(raw_device_timestamps), raw_device_timestamps,
      sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
  if (result != VK_SUCCESS) {
    return release_all(result);
  }

  // raw_device_timestamps[0] and host_times[0] now correspond to the same time point in two different time domains.
  *out_cpu_time = host_times[0];
  *out_gpu_time = raw_device_timestamps[0];

  // Everything beyond here is just approximating the error between the two timestamps, as we know that
  // the delta(host_times[1], host_times[0]) should theoretically be identical to delta(raw_device_timestamps[1],
  // raw_device_timestamps[0]);
  const uint64_t timestamp_mask =
      (info->timestamp_valid_bits == 64) ? UINT64_MAX : ((1ULL << info->timestamp_valid_bits) - 1);
  const double seconds_per_tick = static_cast<double>(info->timestamp_period) / 1e9;
  // Subtract in the integer domain and mask afterwards: this stays exact for large tick
  // counts and still yields the right delta if the counter wrapped within its valid bits.
  const uint64_t device_elapsed_ticks = (raw_device_timestamps[1] - raw_device_timestamps[0]) & timestamp_mask;
  const double device_elapsed_secs = static_cast<double>(device_elapsed_ticks) * seconds_per_tick;
  const auto host_secs =
      static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_times[1] - host_times[0]).count()) /
      1e9;
  printf("%.9f seconds elapsed on host\n", host_secs);
  printf("%.9f seconds elapsed on device\n", device_elapsed_secs);

  return release_all(VK_SUCCESS);
}
	// Inputs consumed by GetCpuGpuTimestamp(): the device/queue to use and the timestamp
	// properties of that device and queue family.
	// NOTE(review): this repeats the CpuGpuTimestampInfo definition that appears earlier in
	// this file; a single translation unit cannot contain both -- confirm which copy is intended.
	struct CpuGpuTimestampInfo {
	VkDevice device; // Device on which the temporary events, pools and fence are created.
	VkQueue queue; // Queue the timestamp-writing command buffer is submitted to.
	uint32_t queue_family_index; // Family used for the transient command pool (presumably the family of `queue`).
	float timestamp_period; // Copy from VkPhysicalDeviceLimits::timestampPeriod
	uint32_t timestamp_valid_bits; // Copy from VkQueueFamilyProperties::timestampValidBits
	};
	// Captures one host timestamp and one device timestamp that refer to (approximately) the
	// same instant, so that the two time domains can be correlated.
	//
	// A command buffer is submitted that blocks on a host-signaled event, writes timestamp 0,
	// blocks on a second host-signaled event, and writes timestamp 1. The host signals each
	// event and samples std::chrono::high_resolution_clock immediately afterwards. The first
	// (host, device) pair is returned; the second pair is only used to print how far the
	// elapsed host time and elapsed device time disagree.
	//
	// Parameters:
	//   info          - device/queue handles and timestamp properties.
	//   out_cpu_time  - receives the host time sampled right after the first event is signaled.
	//   out_gpu_time  - receives the raw device timestamp (in ticks) written after that event.
	//
	// Returns VK_SUCCESS on success, VK_ERROR_FEATURE_NOT_PRESENT if timestamp_valid_bits is 0,
	// or the first failing VkResult from the Vulkan calls made here. The outputs are written
	// only on success. All temporary Vulkan objects are released on every exit path.
	//
	// Note: this call sleeps for roughly two seconds and prints two diagnostic lines to stdout.
	VkResult GetCpuGpuTimestamp(const CpuGpuTimestampInfo *info,
	    std::chrono::high_resolution_clock::time_point *out_cpu_time, uint64_t *out_gpu_time) {
	  if (info->timestamp_valid_bits == 0) {
	    return VK_ERROR_FEATURE_NOT_PRESENT;  // timestamps not supported on the specified queue
	  }

	  // Every temporary object starts out null so the cleanup below can run at any point;
	  // the vkDestroy* functions ignore VK_NULL_HANDLE.
	  VkEvent event0 = VK_NULL_HANDLE;
	  VkEvent event1 = VK_NULL_HANDLE;
	  VkQueryPool qpool = VK_NULL_HANDLE;
	  VkCommandPool cpool = VK_NULL_HANDLE;
	  VkFence fence = VK_NULL_HANDLE;
	  bool gpu_work_pending = false;  // true between a successful submit and a successful fence wait

	  // Releases all temporary objects and passes `status` through, so each exit is a single
	  // `return release_all(...)`.
	  const auto release_all = [&](VkResult status) -> VkResult {
	    if (gpu_work_pending) {
	      // The submitted command buffer may still be parked in vkCmdWaitEvents. Signal both
	      // events (best effort) and drain the queue before destroying objects it references.
	      (void)vkSetEvent(info->device, event0);
	      (void)vkSetEvent(info->device, event1);
	      (void)vkWaitForFences(info->device, 1, &fence, VK_TRUE, UINT64_MAX);
	    }
	    vkDestroyCommandPool(info->device, cpool, NULL);  // also frees the command buffer
	    vkDestroyFence(info->device, fence, NULL);
	    vkDestroyQueryPool(info->device, qpool, NULL);
	    vkDestroyEvent(info->device, event1, NULL);
	    vkDestroyEvent(info->device, event0, NULL);
	    return status;
	  };

	  VkEventCreateInfo event_ci = {VK_STRUCTURE_TYPE_EVENT_CREATE_INFO};
	  VkResult result = vkCreateEvent(info->device, &event_ci, NULL, &event0);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  result = vkCreateEvent(info->device, &event_ci, NULL, &event1);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkQueryPoolCreateInfo qpool_ci = {VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO};
	  qpool_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
	  qpool_ci.queryCount = 2;
	  result = vkCreateQueryPool(info->device, &qpool_ci, NULL, &qpool);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkCommandPoolCreateInfo cpool_ci = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO};
	  cpool_ci.queueFamilyIndex = info->queue_family_index;
	  cpool_ci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
	  result = vkCreateCommandPool(info->device, &cpool_ci, NULL, &cpool);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkFenceCreateInfo fence_ci = {VK_STRUCTURE_TYPE_FENCE_CREATE_INFO};
	  result = vkCreateFence(info->device, &fence_ci, NULL, &fence);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkCommandBufferAllocateInfo cb_alloc_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO};
	  cb_alloc_info.commandPool = cpool;
	  cb_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
	  cb_alloc_info.commandBufferCount = 1;
	  VkCommandBuffer cb = VK_NULL_HANDLE;
	  result = vkAllocateCommandBuffers(info->device, &cb_alloc_info, &cb);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkCommandBufferBeginInfo cb_begin_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
	  cb_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
	  result = vkBeginCommandBuffer(cb, &cb_begin_info);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  // Queries must be reset before they are written; a freshly created pool is not implicitly reset.
	  vkCmdResetQueryPool(cb, qpool, 0, 2);
	  vkCmdWaitEvents(cb, 1, &event0, VK_PIPELINE_STAGE_HOST_BIT,
	      VK_PIPELINE_STAGE_HOST_BIT | VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, NULL, 0, NULL, 0, NULL);
	  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, qpool, 0);
	  vkCmdWaitEvents(
	      cb, 1, &event1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, NULL, 0, NULL, 0, NULL);
	  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, qpool, 1);
	  result = vkEndCommandBuffer(cb);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  VkSubmitInfo submit_info = {VK_STRUCTURE_TYPE_SUBMIT_INFO};
	  submit_info.commandBufferCount = 1;
	  submit_info.pCommandBuffers = &cb;
	  result = vkQueueSubmit(info->queue, 1, &submit_info, fence);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  gpu_work_pending = true;

	  // Wait until we're reasonably sure the GPU is sitting at the
	  // first WaitEvents() call before we signal the event.
	  std::this_thread::sleep_for(std::chrono::seconds(1));

	  std::chrono::high_resolution_clock::time_point host_times[2];
	  result = vkSetEvent(info->device, event0);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  host_times[0] = std::chrono::high_resolution_clock::now();
	  // Let a known amount of time pass so the host and device deltas can be compared below.
	  std::this_thread::sleep_for(std::chrono::seconds(1));
	  result = vkSetEvent(info->device, event1);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  host_times[1] = std::chrono::high_resolution_clock::now();

	  result = vkWaitForFences(info->device, 1, &fence, VK_TRUE, UINT64_MAX);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }
	  gpu_work_pending = false;

	  uint64_t raw_device_timestamps[2] = {0, 0};
	  result = vkGetQueryPoolResults(info->device, qpool, 0, 2, sizeof(raw_device_timestamps), raw_device_timestamps,
	      sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
	  if (result != VK_SUCCESS) {
	    return release_all(result);
	  }

	  // raw_device_timestamps[0] and host_times[0] now correspond to the same time point in two different time domains.
	  *out_cpu_time = host_times[0];
	  *out_gpu_time = raw_device_timestamps[0];

	  // Everything beyond here is just approximating the error between the two timestamps, as we know that
	  // the delta(host_times[1], host_times[0]) should theoretically be identical to delta(raw_device_timestamps[1],
	  // raw_device_timestamps[0]);
	  const uint64_t timestamp_mask =
	      (info->timestamp_valid_bits == 64) ? UINT64_MAX : ((1ULL << info->timestamp_valid_bits) - 1);
	  const double seconds_per_tick = static_cast<double>(info->timestamp_period) / 1e9;
	  // Subtract in the integer domain and mask afterwards: this stays exact for large tick
	  // counts and still yields the right delta if the counter wrapped within its valid bits.
	  const uint64_t device_elapsed_ticks = (raw_device_timestamps[1] - raw_device_timestamps[0]) & timestamp_mask;
	  const double device_elapsed_secs = static_cast<double>(device_elapsed_ticks) * seconds_per_tick;
	  const auto host_secs =
	      static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_times[1] - host_times[0]).count()) /
	      1e9;
	  printf("%.9f seconds elapsed on host\n", host_secs);
	  printf("%.9f seconds elapsed on device\n", device_elapsed_secs);

	  return release_all(VK_SUCCESS);
	}