Skip to content

Instantly share code, notes, and snippets.

@sheredom
Created May 29, 2016 19:14
Show Gist options
  • Save sheredom/523f02bbad2ae397d7ed255f3f3b5a7f to your computer and use it in GitHub Desktop.
Save sheredom/523f02bbad2ae397d7ed255f3f3b5a7f to your computer and use it in GitHub Desktop.
A simple Vulkan compute sample
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
#include "vulkan.h"
#include <stdio.h>
#include <stdlib.h>
// Terminates the process with exit(-1) when `result` is anything other than
// VK_SUCCESS, after printing the line and file of the failing check to stderr.
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement and can be used safely in an unbraced if/else.
#define BAIL_ON_BAD_RESULT(result)                                   \
  do {                                                               \
    if (VK_SUCCESS != (result)) {                                    \
      fprintf(stderr, "Failure at %d %s\n", __LINE__, __FILE__);     \
      exit(-1);                                                      \
    }                                                                \
  } while (0)
// Selects the queue family of `physicalDevice` best suited to transfer work
// and stores its index in `*queueFamilyIndex`.
//
// Families are tried in this order of preference (the sparse binding bit is
// ignored throughout):
//   1. transfer bit set, neither graphics nor compute set
//   2. compute bit set, graphics not set
//   3. any of graphics, compute or transfer set
//
// Returns VK_SUCCESS when a family was found, VK_ERROR_INITIALIZATION_FAILED
// when none qualifies, and VK_ERROR_OUT_OF_HOST_MEMORY when the temporary
// property array could not be allocated. `*queueFamilyIndex` is only written
// on success.
VkResult vkGetBestTransferQueueNPH(VkPhysicalDevice physicalDevice, uint32_t* queueFamilyIndex) {
  uint32_t queueFamilyPropertiesCount = 0;
  vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);
  if (0 == queueFamilyPropertiesCount) {
    return VK_ERROR_INITIALIZATION_FAILED;
  }

  // heap allocation instead of the MSVC-only _alloca: portable, and the size
  // is driver-controlled so it should not land on the stack unchecked
  VkQueueFamilyProperties* const queueFamilyProperties = (VkQueueFamilyProperties*)malloc(
      sizeof(VkQueueFamilyProperties) * queueFamilyPropertiesCount);
  if (!queueFamilyProperties) {
    return VK_ERROR_OUT_OF_HOST_MEMORY;
  }
  vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties);

  VkResult result = VK_ERROR_INITIALIZATION_FAILED;

  // first try and find a queue that has just the transfer bit set
  for (uint32_t i = 0; VK_SUCCESS != result && i < queueFamilyPropertiesCount; i++) {
    // mask out the sparse binding bit that we aren't caring about (yet!)
    const VkQueueFlags maskedFlags = (~VK_QUEUE_SPARSE_BINDING_BIT & queueFamilyProperties[i].queueFlags);
    if (!((VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT) & maskedFlags) &&
        (VK_QUEUE_TRANSFER_BIT & maskedFlags)) {
      *queueFamilyIndex = i;
      result = VK_SUCCESS;
    }
  }

  // otherwise we'll prefer using a compute-only queue,
  // remember that having compute on the queue implicitly enables transfer!
  for (uint32_t i = 0; VK_SUCCESS != result && i < queueFamilyPropertiesCount; i++) {
    const VkQueueFlags maskedFlags = (~VK_QUEUE_SPARSE_BINDING_BIT & queueFamilyProperties[i].queueFlags);
    if (!(VK_QUEUE_GRAPHICS_BIT & maskedFlags) && (VK_QUEUE_COMPUTE_BIT & maskedFlags)) {
      *queueFamilyIndex = i;
      result = VK_SUCCESS;
    }
  }

  // lastly get any queue that'll work for us (graphics, compute or transfer bit set)
  for (uint32_t i = 0; VK_SUCCESS != result && i < queueFamilyPropertiesCount; i++) {
    const VkQueueFlags maskedFlags = (~VK_QUEUE_SPARSE_BINDING_BIT & queueFamilyProperties[i].queueFlags);
    if ((VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT) & maskedFlags) {
      *queueFamilyIndex = i;
      result = VK_SUCCESS;
    }
  }

  free(queueFamilyProperties);
  return result;
}
// Selects the queue family of `physicalDevice` best suited to compute work
// and stores its index in `*queueFamilyIndex`.
//
// A family with the compute bit set and the graphics bit clear is preferred;
// failing that, the first family with the compute bit set is used. The
// transfer and sparse binding bits are ignored in both passes.
//
// Returns VK_SUCCESS when a family was found, VK_ERROR_INITIALIZATION_FAILED
// when no family has the compute bit, and VK_ERROR_OUT_OF_HOST_MEMORY when the
// temporary property array could not be allocated. `*queueFamilyIndex` is only
// written on success.
VkResult vkGetBestComputeQueueNPH(VkPhysicalDevice physicalDevice, uint32_t* queueFamilyIndex) {
  uint32_t queueFamilyPropertiesCount = 0;
  vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);
  if (0 == queueFamilyPropertiesCount) {
    return VK_ERROR_INITIALIZATION_FAILED;
  }

  // heap allocation instead of the MSVC-only _alloca: portable, and the size
  // is driver-controlled so it should not land on the stack unchecked
  VkQueueFamilyProperties* const queueFamilyProperties = (VkQueueFamilyProperties*)malloc(
      sizeof(VkQueueFamilyProperties) * queueFamilyPropertiesCount);
  if (!queueFamilyProperties) {
    return VK_ERROR_OUT_OF_HOST_MEMORY;
  }
  vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties);

  VkResult result = VK_ERROR_INITIALIZATION_FAILED;

  // first try and find a queue that has just the compute bit set
  for (uint32_t i = 0; VK_SUCCESS != result && i < queueFamilyPropertiesCount; i++) {
    // mask out the sparse binding bit that we aren't caring about (yet!) and the transfer bit
    const VkQueueFlags maskedFlags = (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) &
                                      queueFamilyProperties[i].queueFlags);
    if (!(VK_QUEUE_GRAPHICS_BIT & maskedFlags) && (VK_QUEUE_COMPUTE_BIT & maskedFlags)) {
      *queueFamilyIndex = i;
      result = VK_SUCCESS;
    }
  }

  // lastly get any queue that'll work for us
  for (uint32_t i = 0; VK_SUCCESS != result && i < queueFamilyPropertiesCount; i++) {
    const VkQueueFlags maskedFlags = (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) &
                                      queueFamilyProperties[i].queueFlags);
    if (VK_QUEUE_COMPUTE_BIT & maskedFlags) {
      *queueFamilyIndex = i;
      result = VK_SUCCESS;
    }
  }

  free(queueFamilyProperties);
  return result;
}
int main(int argc, const char * const argv[]) {
(void)argc;
(void)argv;
const VkApplicationInfo applicationInfo = {
VK_STRUCTURE_TYPE_APPLICATION_INFO,
0,
"VKComputeSample",
0,
"",
0,
VK_MAKE_VERSION(1, 0, 9)
};
const VkInstanceCreateInfo instanceCreateInfo = {
VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
0,
0,
&applicationInfo,
0,
0,
0,
0
};
VkInstance instance;
BAIL_ON_BAD_RESULT(vkCreateInstance(&instanceCreateInfo, 0, &instance));
uint32_t physicalDeviceCount = 0;
BAIL_ON_BAD_RESULT(vkEnumeratePhysicalDevices(instance, &physicalDeviceCount, 0));
VkPhysicalDevice* const physicalDevices = (VkPhysicalDevice*)malloc(
sizeof(VkPhysicalDevice) * physicalDeviceCount);
BAIL_ON_BAD_RESULT(vkEnumeratePhysicalDevices(instance, &physicalDeviceCount, physicalDevices));
for (uint32_t i = 0; i < physicalDeviceCount; i++) {
uint32_t queueFamilyIndex = 0;
BAIL_ON_BAD_RESULT(vkGetBestComputeQueueNPH(physicalDevices[i], &queueFamilyIndex));
const float queuePrioritory = 1.0f;
const VkDeviceQueueCreateInfo deviceQueueCreateInfo = {
VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
0,
0,
queueFamilyIndex,
1,
&queuePrioritory
};
const VkDeviceCreateInfo deviceCreateInfo = {
VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
0,
0,
1,
&deviceQueueCreateInfo,
0,
0,
0,
0,
0
};
VkDevice device;
BAIL_ON_BAD_RESULT(vkCreateDevice(physicalDevices[i], &deviceCreateInfo, 0, &device));
VkPhysicalDeviceMemoryProperties properties;
vkGetPhysicalDeviceMemoryProperties(physicalDevices[i], &properties);
const int32_t bufferLength = 16384;
const uint32_t bufferSize = sizeof(int32_t) * bufferLength;
// we are going to need two buffers from this one memory
const VkDeviceSize memorySize = bufferSize * 2;
// set memoryTypeIndex to an invalid entry in the properties.memoryTypes array
uint32_t memoryTypeIndex = VK_MAX_MEMORY_TYPES;
for (uint32_t k = 0; k < properties.memoryTypeCount; k++) {
if ((VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT & properties.memoryTypes[k].propertyFlags) &&
(VK_MEMORY_PROPERTY_HOST_COHERENT_BIT & properties.memoryTypes[k].propertyFlags) &&
(memorySize < properties.memoryHeaps[properties.memoryTypes[k].heapIndex].size)) {
memoryTypeIndex = k;
break;
}
}
BAIL_ON_BAD_RESULT(memoryTypeIndex == VK_MAX_MEMORY_TYPES ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS);
const VkMemoryAllocateInfo memoryAllocateInfo = {
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
0,
memorySize,
memoryTypeIndex
};
VkDeviceMemory memory;
BAIL_ON_BAD_RESULT(vkAllocateMemory(device, &memoryAllocateInfo, 0, &memory));
int32_t *payload;
BAIL_ON_BAD_RESULT(vkMapMemory(device, memory, 0, memorySize, 0, (void *)&payload));
for (uint32_t k = 1; k < memorySize / sizeof(int32_t); k++) {
payload[k] = rand();
}
vkUnmapMemory(device, memory);
const VkBufferCreateInfo bufferCreateInfo = {
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
0,
0,
bufferSize,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_SHARING_MODE_EXCLUSIVE,
1,
&queueFamilyIndex
};
VkBuffer in_buffer;
BAIL_ON_BAD_RESULT(vkCreateBuffer(device, &bufferCreateInfo, 0, &in_buffer));
BAIL_ON_BAD_RESULT(vkBindBufferMemory(device, in_buffer, memory, 0));
VkBuffer out_buffer;
BAIL_ON_BAD_RESULT(vkCreateBuffer(device, &bufferCreateInfo, 0, &out_buffer));
BAIL_ON_BAD_RESULT(vkBindBufferMemory(device, out_buffer, memory, bufferSize));
enum {
RESERVED_ID = 0,
FUNC_ID,
IN_ID,
OUT_ID,
GLOBAL_INVOCATION_ID,
VOID_TYPE_ID,
FUNC_TYPE_ID,
INT_TYPE_ID,
INT_ARRAY_TYPE_ID,
STRUCT_ID,
POINTER_TYPE_ID,
ELEMENT_POINTER_TYPE_ID,
INT_VECTOR_TYPE_ID,
INT_VECTOR_POINTER_TYPE_ID,
INT_POINTER_TYPE_ID,
CONSTANT_ZERO_ID,
CONSTANT_ARRAY_LENGTH_ID,
LABEL_ID,
IN_ELEMENT_ID,
OUT_ELEMENT_ID,
GLOBAL_INVOCATION_X_ID,
GLOBAL_INVOCATION_X_PTR_ID,
TEMP_LOADED_ID,
BOUND
};
enum {
INPUT = 1,
UNIFORM = 2,
BUFFER_BLOCK = 3,
ARRAY_STRIDE = 6,
BUILTIN = 11,
BINDING = 33,
OFFSET = 35,
DESCRIPTOR_SET = 34,
GLOBAL_INVOCATION = 28,
OP_TYPE_VOID = 19,
OP_TYPE_FUNCTION = 33,
OP_TYPE_INT = 21,
OP_TYPE_VECTOR = 23,
OP_TYPE_ARRAY = 28,
OP_TYPE_STRUCT = 30,
OP_TYPE_POINTER = 32,
OP_VARIABLE = 59,
OP_DECORATE = 71,
OP_MEMBER_DECORATE = 72,
OP_FUNCTION = 54,
OP_LABEL = 248,
OP_ACCESS_CHAIN = 65,
OP_CONSTANT = 43,
OP_LOAD = 61,
OP_STORE = 62,
OP_RETURN = 253,
OP_FUNCTION_END = 56,
OP_CAPABILITY = 17,
OP_MEMORY_MODEL = 14,
OP_ENTRY_POINT = 15,
OP_EXECUTION_MODE = 16,
OP_COMPOSITE_EXTRACT = 81,
};
int32_t shader[] = {
// first is the SPIR-V header
0x07230203, // magic header ID
0x00010000, // version 1.0.0
0, // generator (optional)
BOUND, // bound
0, // schema
// OpCapability Shader
(2 << 16) | OP_CAPABILITY, 1,
// OpMemoryModel Logical Simple
(3 << 16) | OP_MEMORY_MODEL, 0, 0,
// OpEntryPoint GLCompute %FUNC_ID "f" %IN_ID %OUT_ID
(4 << 16) | OP_ENTRY_POINT, 5, FUNC_ID, 0x00000066,
// OpExecutionMode %FUNC_ID LocalSize 1 1 1
(6 << 16) | OP_EXECUTION_MODE, FUNC_ID, 17, 1, 1, 1,
// next declare decorations
(3 << 16) | OP_DECORATE, STRUCT_ID, BUFFER_BLOCK,
(4 << 16) | OP_DECORATE, GLOBAL_INVOCATION_ID, BUILTIN, GLOBAL_INVOCATION,
(4 << 16) | OP_DECORATE, IN_ID, DESCRIPTOR_SET, 0,
(4 << 16) | OP_DECORATE, IN_ID, BINDING, 0,
(4 << 16) | OP_DECORATE, OUT_ID, DESCRIPTOR_SET, 0,
(4 << 16) | OP_DECORATE, OUT_ID, BINDING, 1,
(4 << 16) | OP_DECORATE, INT_ARRAY_TYPE_ID, ARRAY_STRIDE, 4,
(5 << 16) | OP_MEMBER_DECORATE, STRUCT_ID, 0, OFFSET, 0,
// next declare types
(2 << 16) | OP_TYPE_VOID, VOID_TYPE_ID,
(3 << 16) | OP_TYPE_FUNCTION, FUNC_TYPE_ID, VOID_TYPE_ID,
(4 << 16) | OP_TYPE_INT, INT_TYPE_ID, 32, 1,
(4 << 16) | OP_CONSTANT, INT_TYPE_ID, CONSTANT_ARRAY_LENGTH_ID, bufferLength,
(4 << 16) | OP_TYPE_ARRAY, INT_ARRAY_TYPE_ID, INT_TYPE_ID, CONSTANT_ARRAY_LENGTH_ID,
(3 << 16) | OP_TYPE_STRUCT, STRUCT_ID, INT_ARRAY_TYPE_ID,
(4 << 16) | OP_TYPE_POINTER, POINTER_TYPE_ID, UNIFORM, STRUCT_ID,
(4 << 16) | OP_TYPE_POINTER, ELEMENT_POINTER_TYPE_ID, UNIFORM, INT_TYPE_ID,
(4 << 16) | OP_TYPE_VECTOR, INT_VECTOR_TYPE_ID, INT_TYPE_ID, 3,
(4 << 16) | OP_TYPE_POINTER, INT_VECTOR_POINTER_TYPE_ID, INPUT, INT_VECTOR_TYPE_ID,
(4 << 16) | OP_TYPE_POINTER, INT_POINTER_TYPE_ID, INPUT, INT_TYPE_ID,
// then declare constants
(4 << 16) | OP_CONSTANT, INT_TYPE_ID, CONSTANT_ZERO_ID, 0,
// then declare variables
(4 << 16) | OP_VARIABLE, POINTER_TYPE_ID, IN_ID, UNIFORM,
(4 << 16) | OP_VARIABLE, POINTER_TYPE_ID, OUT_ID, UNIFORM,
(4 << 16) | OP_VARIABLE, INT_VECTOR_POINTER_TYPE_ID, GLOBAL_INVOCATION_ID, INPUT,
// then declare function
(5 << 16) | OP_FUNCTION, VOID_TYPE_ID, FUNC_ID, 0, FUNC_TYPE_ID,
(2 << 16) | OP_LABEL, LABEL_ID,
(5 << 16) | OP_ACCESS_CHAIN, INT_POINTER_TYPE_ID, GLOBAL_INVOCATION_X_PTR_ID, GLOBAL_INVOCATION_ID, CONSTANT_ZERO_ID,
(4 << 16) | OP_LOAD, INT_TYPE_ID, GLOBAL_INVOCATION_X_ID, GLOBAL_INVOCATION_X_PTR_ID,
(6 << 16) | OP_ACCESS_CHAIN, ELEMENT_POINTER_TYPE_ID, IN_ELEMENT_ID, IN_ID, CONSTANT_ZERO_ID, GLOBAL_INVOCATION_X_ID,
(4 << 16) | OP_LOAD, INT_TYPE_ID, TEMP_LOADED_ID, IN_ELEMENT_ID,
(6 << 16) | OP_ACCESS_CHAIN, ELEMENT_POINTER_TYPE_ID, OUT_ELEMENT_ID, OUT_ID, CONSTANT_ZERO_ID, GLOBAL_INVOCATION_X_ID,
(3 << 16) | OP_STORE, OUT_ELEMENT_ID, TEMP_LOADED_ID,
(1 << 16) | OP_RETURN,
(1 << 16) | OP_FUNCTION_END,
};
VkShaderModuleCreateInfo shaderModuleCreateInfo = {
VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
0,
0,
sizeof(shader),
shader
};
VkShaderModule shader_module;
BAIL_ON_BAD_RESULT(vkCreateShaderModule(device, &shaderModuleCreateInfo, 0, &shader_module));
VkDescriptorSetLayoutBinding descriptorSetLayoutBindings[2] = {
{
0,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
0
},
{
1,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
0
}
};
VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = {
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
0,
0,
2,
descriptorSetLayoutBindings
};
VkDescriptorSetLayout descriptorSetLayout;
BAIL_ON_BAD_RESULT(vkCreateDescriptorSetLayout(device, &descriptorSetLayoutCreateInfo, 0, &descriptorSetLayout));
VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
0,
0,
1,
&descriptorSetLayout,
0,
0
};
VkPipelineLayout pipelineLayout;
BAIL_ON_BAD_RESULT(vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, 0, &pipelineLayout));
VkComputePipelineCreateInfo computePipelineCreateInfo = {
VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
0,
0,
{
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
0,
0,
VK_SHADER_STAGE_COMPUTE_BIT,
shader_module,
"f",
0
},
pipelineLayout,
0,
0
};
VkPipeline pipeline;
BAIL_ON_BAD_RESULT(vkCreateComputePipelines(device, 0, 1, &computePipelineCreateInfo, 0, &pipeline));
VkCommandPoolCreateInfo commandPoolCreateInfo = {
VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
0,
0,
queueFamilyIndex
};
VkDescriptorPoolSize descriptorPoolSize = {
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
2
};
VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
0,
0,
1,
1,
&descriptorPoolSize
};
VkDescriptorPool descriptorPool;
BAIL_ON_BAD_RESULT(vkCreateDescriptorPool(device, &descriptorPoolCreateInfo, 0, &descriptorPool));
VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
0,
descriptorPool,
1,
&descriptorSetLayout
};
VkDescriptorSet descriptorSet;
BAIL_ON_BAD_RESULT(vkAllocateDescriptorSets(device, &descriptorSetAllocateInfo, &descriptorSet));
VkDescriptorBufferInfo in_descriptorBufferInfo = {
in_buffer,
0,
VK_WHOLE_SIZE
};
VkDescriptorBufferInfo out_descriptorBufferInfo = {
out_buffer,
0,
VK_WHOLE_SIZE
};
VkWriteDescriptorSet writeDescriptorSet[2] = {
{
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
0,
descriptorSet,
0,
0,
1,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
0,
&in_descriptorBufferInfo,
0
},
{
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
0,
descriptorSet,
1,
0,
1,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
0,
&out_descriptorBufferInfo,
0
}
};
vkUpdateDescriptorSets(device, 2, writeDescriptorSet, 0, 0);
VkCommandPool commandPool;
BAIL_ON_BAD_RESULT(vkCreateCommandPool(device, &commandPoolCreateInfo, 0, &commandPool));
VkCommandBufferAllocateInfo commandBufferAllocateInfo = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
0,
commandPool,
VK_COMMAND_BUFFER_LEVEL_PRIMARY,
1
};
VkCommandBuffer commandBuffer;
BAIL_ON_BAD_RESULT(vkAllocateCommandBuffers(device, &commandBufferAllocateInfo, &commandBuffer));
VkCommandBufferBeginInfo commandBufferBeginInfo = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
0,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
0
};
BAIL_ON_BAD_RESULT(vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo));
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipelineLayout, 0, 1, &descriptorSet, 0, 0);
vkCmdDispatch(commandBuffer, bufferSize / sizeof(int32_t), 1, 1);
BAIL_ON_BAD_RESULT(vkEndCommandBuffer(commandBuffer));
VkQueue queue;
vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue);
VkSubmitInfo submitInfo = {
VK_STRUCTURE_TYPE_SUBMIT_INFO,
0,
0,
0,
0,
1,
&commandBuffer,
0,
0
};
BAIL_ON_BAD_RESULT(vkQueueSubmit(queue, 1, &submitInfo, 0));
BAIL_ON_BAD_RESULT(vkQueueWaitIdle(queue));
BAIL_ON_BAD_RESULT(vkMapMemory(device, memory, 0, memorySize, 0, (void *)&payload));
for (uint32_t k = 0, e = bufferSize / sizeof(int32_t); k < e; k++) {
BAIL_ON_BAD_RESULT(payload[k + e] == payload[k] ? VK_SUCCESS : VK_ERROR_OUT_OF_HOST_MEMORY);
}
}
}
@michaelboccara
Copy link

Thanks for your code sample
Can you explain what the NPH suffix is and what it means?
I can't find any explanation on the web...
Thanks!

@sheredom
Copy link
Author

the NPH is just my initials - Neil Paul Henning. Since C doesn't have any namespaces I decided to namespace the methods!

@FrickHazard
Copy link

This is great. I think learning Vulkan through the compute side first makes a lot more sense, rather than going through the very long graphics setup.

@Doi6doi
Copy link

Doi6doi commented Sep 15, 2020

UNASSIGNED-CoreValidation-Shader-InconsistentSpirv(ERROR / SPEC): msgNum: 0 - SPIR-V module not valid: Interface variable id <4> is used by entry point 'f' id <1>, but is not listed as an interface
%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3int Input

the module validates fine for me after changing line 325 to:
(4 << 16) | OP_ENTRY_POINT, 5, FUNC_ID, 0x66, GLOBAL_INVOCATION_ID

@mehmet-soyturk
Copy link

UNASSIGNED-CoreValidation-Shader-InconsistentSpirv(ERROR / SPEC): msgNum: 0 - SPIR-V module not valid: Interface variable id <4> is used by entry point 'f' id <1>, but is not listed as an interface
%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3int Input

the module validates fine for me after changing line 325 to:
(4 << 16) | OP_ENTRY_POINT, 5, FUNC_ID, 0x66, GLOBAL_INVOCATION_ID

It works for me with (5 << 16) | OP_ENTRY_POINT, 5, FUNC_ID, 0x66, GLOBAL_INVOCATION_ID.

Seemingly (x << 16) means "the current instruction consists of x uint32_t values."

@mehmet-soyturk
Copy link

mehmet-soyturk commented Dec 30, 2020

I measured times with the following code:

        auto tm0 = std::chrono::high_resolution_clock::now();
        BAIL_ON_BAD_RESULT(vkQueueSubmit(queue, 1, &submitInfo, 0));
        BAIL_ON_BAD_RESULT(vkQueueWaitIdle(queue));
        auto tm1 = std::chrono::high_resolution_clock::now();
        BAIL_ON_BAD_RESULT(vkMapMemory(device, memory, 0, memorySize, 0, (void **)&payload));
        auto tm2 = std::chrono::high_resolution_clock::now();

        for (uint32_t k = 0, e = bufferSize / sizeof(int32_t); k < e; k++) {
            BAIL_ON_BAD_RESULT(payload[k + e] == payload[k] ? VK_SUCCESS : VK_ERROR_OUT_OF_HOST_MEMORY);
        }
        auto tm3 = std::chrono::high_resolution_clock::now();
        using milliseconds = std::chrono::duration<double, std::milli>;
        milliseconds tmProcess = tm1-tm0;
        milliseconds tmMap     = tm2-tm1;
        milliseconds tmRead    = tm3-tm2;
        printf("Times (ms):\n    process: %f\n    map    : %f\n    read    : %f",
            tmProcess.count(), tmMap.count(), tmRead.count());

NVidia Geforce RTX 2080 super, Windows 10, Core I9-10980HK.

Times (ms):
    process: 0.315900
    map    : 0.000300
    read    : 3.107100

Reading back the memory takes most of the time because a memory without VK_MEMORY_PROPERTY_HOST_CACHED_BIT was being selected. If I change memory selection code like this:

        for (uint32_t k = 0; k < properties.memoryTypeCount; k++) {
            if ((VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT & properties.memoryTypes[k].propertyFlags) &&
                (VK_MEMORY_PROPERTY_HOST_COHERENT_BIT & properties.memoryTypes[k].propertyFlags) &&
                (VK_MEMORY_PROPERTY_HOST_CACHED_BIT & properties.memoryTypes[k].propertyFlags) &&
                (memorySize < properties.memoryHeaps[properties.memoryTypes[k].heapIndex].size)) {
                memoryTypeIndex = k;
                break;
            }
        }

then I get

Times (ms):
    process: 0.327000
    map    : 0.000500
    read    : 0.010500

@grzavala
Copy link

grzavala commented Mar 14, 2021

Thank you to everyone who participates because it enriches the development of NPH.

VkQueueFamilyProperties* const queueFamilyProperties = (VkQueueFamilyProperties*)_alloca(
sizeof(VkQueueFamilyProperties) * queueFamilyPropertiesCount);

Visual Studio Community advises me to replace _alloca with _malloca.

@stolk
Copy link

stolk commented Sep 9, 2023

@xakepp35 You can write compute shader in GLSL and then compile to SPIR-V.

Or alternatively, write an OpenCL kernel and then compile to SPIR-V using clspv

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment