w-flo/test-ssbo-std430.cpp

## test-ssbo-std430.cpp
/*******

Compilation:
g++ test-ssbo-std430.cpp $(pkg-config --cflags --libs sdl2 glew) && MESA_DEBUG=true ./a.out

renderdoc 0.91 (e.g. git de7a74c7) can be configured to queue capture of frame 2 to inspect the results of this test program.

This is a simple test program to isolate one of the issues I encountered using my HD 7870 GPU and the Mesa radeonsi driver (amdgpu kernel module) while testing the Banshee 3D engine.
Mesa version is 17.2.4-0ubuntu1~17.10.1 (from Ubuntu artful-proposed archive).

This prepares a simple compute shader that sets a few values in a structured shader storage buffer.
That buffer is accessed through an array of ResultRecords, using the std430 memory layout:
struct ResultRecord
{
	float a[10];
	float b[10];
	float c[10];
	float weight;
};

There are 6 threads, each one creates a temporary local variable of the struct type,
then sets some numbers in that struct, and then assigns it
to the thread's slot in the ssb[] array representing the shader storage buffer.

After the 6 shader threads finish, those values should be set for 0<=i<6:
ssb[i].weight=i
ssb[i].a[i]=i
ssb[i].b[i]=i
ssb[i].c[i]=i

But when looking at the buffer contents in renderdoc, only the values for a[] were set correctly by all threads.
The values for b[] are off by 30 floats, e.g. the value 0 meant for ssb[0].b[0] ends up in ssb[1].a[9].
The values for c[] are off by 60 floats, e.g. the value 0 meant for ssb[0].c[0] ends up in ssb[2].b[8].
The weight values are most likely off by 90 floats, but only the value 5 meant for ssb[5].weight can be seen in the buffer at the location ssb[8].c[7].
Weights for threads 0-4 are probably overwritten by 0-values from a, b or c.
This also leads to writes out of the expected bounds, if the buffer is large enough (this program allocates a buffer 10x the expected size so this is visible in renderdoc).

When moving the "weight" member up to the top of the ResultRecord struct, the weight values end up where I expect them to be.
But a, b and c are off by 3, 33 and 63 floats, respectively.

Three different program variants that work correctly are included, but commented out:
- When assigning the a, b, c arrays and the weight float to ssb[i] individually instead of the whole struct in one assignment, everything works as expected.
- When assigning the values in ssb[i].a[n] etc. directly instead of using the temporary local struct variable, everything works as expected.
- When changing the floats to vec4s, everything works as expected.

So it seems like there are alignment issues when using the assignment operator to assign a struct value to the std430 ssb.
The next float after a single float or array of floats will be vec4-aligned instead of float-aligned, while intra-array-alignment is as expected (no padding, float-alignment).
Even though elements in the buffer should not be padded to be vec4-aligned AIUI, the code that copies over structs on assignment to the SSB appears to align them.
But this is not a general issue when accessing shader storage buffers, only when assigning struct variables to an array that is backed by the SSBO.


After looking at some of the generated TGSI instructions, I think the issue is already visible there.
The IMM array apparently contains (among other things) buffer offsets for transferring values from the local "result" variable (part of TEMP in TGSI) into the buffer:
IMM[6] UINT32 {168, 164, 160, 36}
IMM[7] UINT32 {32, 28, 24, 20}
IMM[8] UINT32 {16, 12, 8, 4}
Correct offsets for a[] values are 0,4,8,12,16,20,24,28,32,36, as reflected by IMM[6-8].
But the next offset for b[] should be 40 bytes, while the IMM[] array only offers 160, i.e. a 120 bytes / 30 floats difference as seen above.


*******/


#include <string>
#include <iostream>

#include <unistd.h>

#include <GL/glew.h>
#include <SDL2/SDL.h>

#define NUM_COEFF_SETS 6

/**
 * CPU-side mirror of the shader's ResultRecord struct.
 *
 * main() sizes the shader storage buffer with sizeof(ResultRecord) and reads
 * the buffer back into an array of this type, so the layout here has to match
 * the std430 layout of the GLSL struct: three arrays of 10 floats followed by
 * a single float, with no padding in between.
 */
struct ResultRecord
{
	float a[10];
	float b[10];
	float c[10];
	float weight;
};

// Lock the layout in: one record is expected to be 31 tightly packed floats.
// If a compiler ever inserted padding, the readback in main() would silently
// index the wrong floats.
static_assert(sizeof(ResultRecord) == 31 * sizeof(float),
              "ResultRecord must be 31 tightly packed floats to match the std430 buffer layout");


// Works correctly when using vec4 instead of float (need to change the shader's struct definition and the glsl code, too)
/*
struct ResultRecord
{
	float a[4*10];
	float b[4*10];
	float c[4*10];
	float weight;
	float unusedWeightVectorParts[3];
};
*/


// GLSL 4.50 compute shader source; main() hands it to glCreateShaderProgramv().
// The work group has 6 invocations. Each one fills a local ResultRecord and
// stores it into its own slot of the std430 buffer array `ssb` with a single
// whole-struct assignment (the case this test program is about).
// The GLSL comments inside the string hold the alternative variants that the
// author reports as working; they are part of the string and are left untouched.
std::string shaderSrc = R"(

#version 450
#define OPENGL
#define OPENGL450

layout(local_size_x = 6, local_size_y = 1, local_size_z = 1) in;

struct ResultRecord
{
	float a[10];
	float b[10];
	float c[10];
	float weight;
};

// Works correctly when using vec4 instead of float
/*
struct ResultRecord
{
	vec4 a[10];
	vec4 b[10];
	vec4 c[10];
	vec4 weight;
};
*/

layout(std430) buffer gOutput
{
	ResultRecord ssb[];
};

void main()
{
	// Fails when using the intermediate "result" variable
	ResultRecord result;
	for (int i=0; i<10; i++) {
		result.a[i] = 0;
		result.b[i] = 0;
		result.c[i] = 0;
	}
	result.weight = gl_LocalInvocationIndex;
	result.a[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	result.b[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	result.c[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	// This fails:
	ssb[gl_LocalInvocationIndex] = result;

	// This works as a replacement for the previous line:
	/*
	ssb[gl_LocalInvocationIndex].a = result.a;
	ssb[gl_LocalInvocationIndex].b = result.b;
	ssb[gl_LocalInvocationIndex].c = result.c;
	ssb[gl_LocalInvocationIndex].weight = result.weight;
	*/

	// Works as expected when assigning values directly to ssb as a replacement for all the code above this:
	/*
	ssb[gl_LocalInvocationIndex].weight = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].a[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].b[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].c[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	*/

	// Also works as expected when using vec4 instead of float as a replacement for all the code above this:
	/*
	ResultRecord result;
	for (int i=0; i<10; i++) {
		result.a[i] = vec4(0);
		result.b[i] = vec4(0);
		result.c[i] = vec4(0);
	}
	result.weight = vec4(gl_LocalInvocationIndex,0,0,0);
	result.a[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	result.b[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	result.c[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex] = result;
	*/
}

)"; //// End of shader

int main(int argc, char* argv[]) {
	if (SDL_Init(SDL_INIT_VIDEO) < 0) {
		std::cout << "Failed to init SDL\n";
		return 1;
	}

	SDL_Window *window = SDL_CreateWindow("Compute Shader Test", 0, 0, 200, 50, SDL_WINDOW_OPENGL|SDL_WINDOW_RESIZABLE);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
	SDL_GLContext glcontext = SDL_GL_CreateContext(window);

	glewInit();
	sleep(1);
	SDL_GL_SwapWindow(window);

	// Prepare shader program and pipeline
	const char* shaderSrcChars = shaderSrc.c_str();
	GLuint csProgramId = glCreateShaderProgramv(GL_COMPUTE_SHADER, 1, &shaderSrcChars);
	GLuint pipelineId = 0;
	glGenProgramPipelines(1, &pipelineId);
	glUseProgramStages(pipelineId, GL_COMPUTE_SHADER_BIT, csProgramId);
	glBindProgramPipeline(pipelineId);

	// Prepare SSBO (10x the needed size to make overflows visible)
	GLuint ssboId = 0;
	glGenBuffers(1, &ssboId);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboId);
	glBufferData(GL_SHADER_STORAGE_BUFFER, 10 * NUM_COEFF_SETS * sizeof(ResultRecord), nullptr, GL_DYNAMIC_DRAW);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssboId);
	glShaderStorageBlockBinding(csProgramId, 0, 0);

	// let the magic happen
	glDispatchCompute(1, 1, 1);
	SDL_GL_SwapWindow(window);

	// read the shader storage buffer and print what's stored as weight
	ResultRecord ssb[NUM_COEFF_SETS];
	glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(ssb), &ssb);

	for (int i=0; i<NUM_COEFF_SETS; i++) {
		std::cout << i << ": weight " << ssb[i].weight << " (should be " << i << ")" << std::endl;
	}

	sleep(1);
	SDL_GL_DeleteContext(glcontext);
}
	/*******

	Compilation:
	g++ test-ssbo-std430.cpp $(pkg-config --cflags --libs sdl2 glew) && MESA_DEBUG=true ./a.out

	renderdoc 0.91 (e.g. git de7a74c7) can be configured to queue capture of frame 2 to inspect the results of this test program.

	This is a simple test program to isolate one of the issues I encountered using my HD 7870 GPU and the Mesa radeonsi driver (amdgpu kernel module) while testing the Banshee 3D engine.
	Mesa version is 17.2.4-0ubuntu1~17.10.1 (from Ubuntu artful-proposed archive).

	This prepares a simple compute shader that sets a few values in a structured shader storage buffer.
	That buffer is accessed through an array of ResultRecords, using the std430 memory layout:
	struct ResultRecord
	{
	float a[10];
	float b[10];
	float c[10];
	float weight;
	};

	There are 6 threads, each one creates a temporary local variable of the struct type,
	then sets some numbers in that struct, and then assigns it
	to the thread's slot in the ssb[] array representing the shader storage buffer.

	After the 6 shader threads finish, those values should be set for 0<=i<6:
	ssb[i].weight=i
	ssb[i].a[i]=i
	ssb[i].b[i]=i
	ssb[i].c[i]=i

	But when looking at the buffer contents in renderdoc, only the values for a[] were set correctly by all threads.
	The values for b[] are off by 30 floats, e.g. the value 0 meant for ssb[0].b[0] ends up in ssb[1].a[9].
	The values for c[] are off by 60 floats, e.g. the value 0 meant for ssb[0].c[0] ends up in ssb[2].b[8].
	The weight values are most likely off by 90 floats, but only the value 5 meant for ssb[5].weight can be seen in the buffer at the location ssb[8].c[7].
	Weights for threads 0-4 are probably overwritten by 0-values from a, b or c.
	This also leads to writes out of the expected bounds, if the buffer is large enough (this program allocates a buffer 10x the expected size so this is visible in renderdoc).

	When moving the "weight" member up to the top of the ResultRecord struct, the weight values end up where I expect them to be.
	But a, b and c are off by 3, 33 and 63 floats, respectively.

	Three different program variants that work correctly are included, but commented out:
	- When assigning the a, b, c arrays and the weight float to ssb[i] individually instead of the whole struct in one assignment, everything works as expected.
	- When assigning the values in ssb[i].a[n] etc. directly instead of using the temporary local struct variable, everything works as expected.
	- When changing the floats to vec4s, everything works as expected.

	So it seems like there are alignment issues when using the assignment operator to assign a struct value to the std430 ssb.
	The next float after a single float or array of floats will be vec4-aligned instead of float-aligned, while intra-array-alignment is as expected (no padding, float-alignment).
	Even though elements in the buffer should not be padded to be vec4-aligned AIUI, the code that copies over structs on assignment to the SSB appears to align them.
	But this is not a general issue when accessing shader storage buffers, only when assigning struct variables to an array that is backed by the SSBO.



	After looking at some of the generated TGSI instructions, I think the issue is already visible there.
	The IMM array apparently contains (among other things) buffer offsets for transferring values from the local "result" variable (part of TEMP in TGSI) into the buffer:
	IMM[6] UINT32 {168, 164, 160, 36}
	IMM[7] UINT32 {32, 28, 24, 20}
	IMM[8] UINT32 {16, 12, 8, 4}
	Correct offsets for a[] values are 0,4,8,12,16,20,24,28,32,36, as reflected by IMM[6-8].
	But the next offset for b[] should be 40 bytes, while the IMM[] array only offers 160, i.e. a 120 bytes / 30 floats difference as seen above.


	*******/


	#include <string>
	#include <iostream>

	#include <unistd.h>

	#include <GL/glew.h>
	#include <SDL2/SDL.h>

	#define NUM_COEFF_SETS 6

	/**
	 * CPU-side mirror of the shader's ResultRecord struct.
	 *
	 * main() sizes the shader storage buffer with sizeof(ResultRecord) and reads
	 * the buffer back into an array of this type, so the layout here has to match
	 * the std430 layout of the GLSL struct: three arrays of 10 floats followed by
	 * a single float, with no padding in between.
	 */
	struct ResultRecord
	{
		float a[10];
		float b[10];
		float c[10];
		float weight;
	};

	// Lock the layout in: one record is expected to be 31 tightly packed floats.
	// If a compiler ever inserted padding, the readback in main() would silently
	// index the wrong floats.
	static_assert(sizeof(ResultRecord) == 31 * sizeof(float),
	              "ResultRecord must be 31 tightly packed floats to match the std430 buffer layout");


	// Works correctly when using vec4 instead of float (need to change the shader's struct definition and the glsl code, too)
	/*
	struct ResultRecord
	{
	float a[4*10];
	float b[4*10];
	float c[4*10];
	float weight;
	float unusedWeightVectorParts[3];
	};
	*/


	// GLSL 4.50 compute shader source; main() hands it to glCreateShaderProgramv().
	// The work group has 6 invocations. Each one fills a local ResultRecord and
	// stores it into its own slot of the std430 buffer array `ssb` with a single
	// whole-struct assignment (the case this test program is about).
	// The GLSL comments inside the string hold the alternative variants that the
	// author reports as working; they are part of the string and are left untouched.
	std::string shaderSrc = R"(

	#version 450
	#define OPENGL
	#define OPENGL450

	layout(local_size_x = 6, local_size_y = 1, local_size_z = 1) in;

	struct ResultRecord
	{
	float a[10];
	float b[10];
	float c[10];
	float weight;
	};

	// Works correctly when using vec4 instead of float
	/*
	struct ResultRecord
	{
	vec4 a[10];
	vec4 b[10];
	vec4 c[10];
	vec4 weight;
	};
	*/

	layout(std430) buffer gOutput
	{
	ResultRecord ssb[];
	};

	void main()
	{
	// Fails when using the intermediate "result" variable
	ResultRecord result;
	for (int i=0; i<10; i++) {
	result.a[i] = 0;
	result.b[i] = 0;
	result.c[i] = 0;
	}
	result.weight = gl_LocalInvocationIndex;
	result.a[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	result.b[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	result.c[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	// This fails:
	ssb[gl_LocalInvocationIndex] = result;

	// This works as a replacement for the previous line:
	/*
	ssb[gl_LocalInvocationIndex].a = result.a;
	ssb[gl_LocalInvocationIndex].b = result.b;
	ssb[gl_LocalInvocationIndex].c = result.c;
	ssb[gl_LocalInvocationIndex].weight = result.weight;
	*/

	// Works as expected when assigning values directly to ssb as a replacement for all the code above this:
	/*
	ssb[gl_LocalInvocationIndex].weight = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].a[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].b[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex].c[gl_LocalInvocationIndex] = gl_LocalInvocationIndex;
	*/

	// Also works as expected when using vec4 instead of float as a replacement for all the code above this:
	/*
	ResultRecord result;
	for (int i=0; i<10; i++) {
	result.a[i] = vec4(0);
	result.b[i] = vec4(0);
	result.c[i] = vec4(0);
	}
	result.weight = vec4(gl_LocalInvocationIndex,0,0,0);
	result.a[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	result.b[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	result.c[gl_LocalInvocationIndex].x = gl_LocalInvocationIndex;
	ssb[gl_LocalInvocationIndex] = result;
	*/
	}

	)"; //// End of shader

	int main(int argc, char* argv[]) {
	if (SDL_Init(SDL_INIT_VIDEO) < 0) {
	std::cout << "Failed to init SDL\n";
	return 1;
	}

	SDL_Window *window = SDL_CreateWindow("Compute Shader Test", 0, 0, 200, 50, SDL_WINDOW_OPENGL\|SDL_WINDOW_RESIZABLE);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
	SDL_GLContext glcontext = SDL_GL_CreateContext(window);

	glewInit();
	sleep(1);
	SDL_GL_SwapWindow(window);

	// Prepare shader program and pipeline
	const char* shaderSrcChars = shaderSrc.c_str();
	GLuint csProgramId = glCreateShaderProgramv(GL_COMPUTE_SHADER, 1, &shaderSrcChars);
	GLuint pipelineId = 0;
	glGenProgramPipelines(1, &pipelineId);
	glUseProgramStages(pipelineId, GL_COMPUTE_SHADER_BIT, csProgramId);
	glBindProgramPipeline(pipelineId);

	// Prepare SSBO (10x the needed size to make overflows visible)
	GLuint ssboId = 0;
	glGenBuffers(1, &ssboId);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboId);
	glBufferData(GL_SHADER_STORAGE_BUFFER, 10 * NUM_COEFF_SETS * sizeof(ResultRecord), nullptr, GL_DYNAMIC_DRAW);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssboId);
	glShaderStorageBlockBinding(csProgramId, 0, 0);

	// let the magic happen
	glDispatchCompute(1, 1, 1);
	SDL_GL_SwapWindow(window);

	// read the shader storage buffer and print what's stored as weight
	ResultRecord ssb[NUM_COEFF_SETS];
	glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(ssb), &ssb);

	for (int i=0; i<NUM_COEFF_SETS; i++) {
	std::cout << i << ": weight " << ssb[i].weight << " (should be " << i << ")" << std::endl;
	}

	sleep(1);
	SDL_GL_DeleteContext(glcontext);
	}