
@reduz
Last active December 4, 2023 17:19
/*******************************/
/* RenderingDevice API Changes */
/*******************************/
// All usages of BarrierMask are removed; barriers can no longer be specified manually.
// compute_list_barrier() must remain, though, to indicate that the user wants to wait for a pass to finish before the next one.
/**********************/
/* framebuffer format */
/**********************/
struct AttachmentFormat {
	enum Usage {
		USAGE_RENDER_TARGET,
		USAGE_INPUT,
		USAGE_UNUSED
	};
	Usage usage;
	DataFormat format;
	TextureSamples samples;
	AttachmentFormat() {
		usage = USAGE_RENDER_TARGET;
		format = DATA_FORMAT_R8G8B8A8_UNORM;
		samples = TEXTURE_SAMPLES_1;
	}
};
typedef int64_t FramebufferFormatID;
// Changes:
// - Subpasses are gone
// - Input is specified directly
// This ID is guaranteed to be unique for the same formats and does not need to be freed.
virtual FramebufferFormatID framebuffer_format_create(const Vector<AttachmentFormat> &p_format, uint32_t p_view_count = 1) = 0;
virtual FramebufferFormatID framebuffer_format_create_empty(TextureSamples p_samples = TEXTURE_SAMPLES_1) = 0;
virtual TextureSamples framebuffer_format_get_texture_samples(FramebufferFormatID p_format, uint32_t p_pass = 0) = 0;
// A bit set to 1 in p_input_mask signals that the corresponding attachment is used as an input attachment.
virtual RID framebuffer_create(const Vector<RID> &p_texture_attachments, uint64_t p_input_mask = 0, FramebufferFormatID p_format_check = INVALID_ID, uint32_t p_view_count = 1) = 0;
virtual RID framebuffer_create_empty(const Size2i &p_size, TextureSamples p_samples = TEXTURE_SAMPLES_1, FramebufferFormatID p_format_check = INVALID_ID) = 0;
virtual bool framebuffer_is_valid(RID p_framebuffer) const = 0;
virtual void framebuffer_set_invalidation_callback(RID p_framebuffer, InvalidationCallback p_callback, void *p_userdata) = 0;
virtual FramebufferFormatID framebuffer_get_format(RID p_framebuffer) = 0;
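The format-ID contract above (same attachment list, same ID, nothing to free) can be sketched with a simple cache. This is a hypothetical standalone model, not the real Godot implementation; the `format_cache` map and the stand-in enums are invented for illustration:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

// Hypothetical stand-ins for the real RenderingDevice types.
enum DataFormat { DATA_FORMAT_R8G8B8A8_UNORM };
enum TextureSamples { TEXTURE_SAMPLES_1 };
enum Usage { USAGE_RENDER_TARGET, USAGE_INPUT, USAGE_UNUSED };

struct AttachmentFormat {
    Usage usage = USAGE_RENDER_TARGET;
    DataFormat format = DATA_FORMAT_R8G8B8A8_UNORM;
    TextureSamples samples = TEXTURE_SAMPLES_1;
    bool operator<(const AttachmentFormat &p_other) const {
        if (usage != p_other.usage) return usage < p_other.usage;
        if (format != p_other.format) return format < p_other.format;
        return samples < p_other.samples;
    }
};

typedef int64_t FramebufferFormatID;

// Cache: identical attachment lists always map to the same ID, so IDs
// never need to be freed and can be compared for pipeline compatibility.
static std::map<std::vector<AttachmentFormat>, FramebufferFormatID> format_cache;

FramebufferFormatID framebuffer_format_create(const std::vector<AttachmentFormat> &p_format) {
    auto it = format_cache.find(p_format);
    if (it != format_cache.end()) {
        return it->second; // Same formats, same ID.
    }
    FramebufferFormatID id = (FramebufferFormatID)format_cache.size();
    format_cache.insert({ p_format, id });
    return id;
}
```

Because the ID is a pure function of the attachment list, two framebuffers created from identical formats can share pipelines without any explicit compatibility check.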
/***************/
/* uniform set */
/***************/
// Changes:
// - Removal of SamplerWithTexture, TextureBuffer, SamplerWithTextureBuffer. Makes portability easier.
enum UniformType {
	UNIFORM_TYPE_SAMPLER, // For sampling only (sampler GLSL type).
	UNIFORM_TYPE_TEXTURE, // Only a texture (textureXX GLSL type).
	UNIFORM_TYPE_IMAGE, // Storage image (imageXX GLSL type), for compute mostly.
	UNIFORM_TYPE_UNIFORM_BUFFER, // Regular uniform buffer (or UBO).
	UNIFORM_TYPE_STORAGE_BUFFER, // Storage buffer ("buffer" qualifier), like a UBO but supports storage, for compute mostly.
	UNIFORM_TYPE_INPUT_ATTACHMENT, // Used for sub-pass read/write, for mobile mostly.
	UNIFORM_TYPE_MAX
};
/*********************************/
/* render pass framebuffer usage */
/*********************************/
enum DrawListBufferLoadAction {
	DRAW_LIST_LOAD_ACTION_LOAD,
	DRAW_LIST_LOAD_ACTION_CLEAR,
	DRAW_LIST_LOAD_ACTION_DONT_CARE,
};
enum DrawListBufferStoreAction {
	DRAW_LIST_STORE_ACTION_STORE,
	DRAW_LIST_STORE_ACTION_RESOLVE,
	DRAW_LIST_STORE_ACTION_DISCARD,
};
// Changes:
// - Ability to fine-grain clear/discard per color buffer. This was missing before.
// - No longer specify initial/final actions, only what to do on load (load/clear/don't care) and on store (store/resolve/discard).
// - No longer specify storage textures, this can now be inferred automatically by the barrier solver.
// - No longer specify subpasses, this can now be inferred automatically by the subpass solver.
virtual DrawListID draw_list_begin(RID p_framebuffer, const Vector<DrawListBufferLoadAction> &p_buffer_load_actions, const Vector<DrawListBufferStoreAction> &p_buffer_store_actions, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2()) = 0;
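For reference, these actions map closely onto Vulkan's attachment load/store ops. The sketch below is illustrative only (plain strings, not Godot or Vulkan code); the `to_vk_load_op` helper is invented for this example, and note that `DRAW_LIST_STORE_ACTION_RESOLVE` has no direct `VkAttachmentStoreOp` equivalent, since Vulkan expresses MSAA resolves via `pResolveAttachments` instead:

```cpp
#include <cassert>
#include <string>

enum DrawListBufferLoadAction {
    DRAW_LIST_LOAD_ACTION_LOAD,
    DRAW_LIST_LOAD_ACTION_CLEAR,
    DRAW_LIST_LOAD_ACTION_DONT_CARE,
};

// Illustrative mapping to the VkAttachmentLoadOp names (as strings).
std::string to_vk_load_op(DrawListBufferLoadAction p_action) {
    switch (p_action) {
        case DRAW_LIST_LOAD_ACTION_LOAD:
            return "VK_ATTACHMENT_LOAD_OP_LOAD"; // Keep previous contents.
        case DRAW_LIST_LOAD_ACTION_CLEAR:
            return "VK_ATTACHMENT_LOAD_OP_CLEAR"; // Clear to a given value.
        case DRAW_LIST_LOAD_ACTION_DONT_CARE:
            return "VK_ATTACHMENT_LOAD_OP_DONT_CARE"; // Contents become undefined.
    }
    return "";
}
```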
/*****************/
/* Acyclic Graph */
/*****************/
/*
 * The acyclic graph solver kicks off after the render thread has finished sending the draw commands.
 * It can optionally run asynchronously (post a semaphore that starts the solving and submission).
 */
// The basic idea of how this acyclic graph works is based in part on this document:
// https://levelup.gitconnected.com/organizing-gpu-work-with-directed-acyclic-graphs-f3fd5f2c2af3
//
// So it's good to read it first. There are, however, many things that work differently due to how Godot works.
// The main difference is that, in Godot, you don't use a manually created render graph (like in the API design above).
// The idea is that, inside RenderingDevice, this render graph will be created automatically instead based on the commands
// supplied, in a way similar to how Direct3D11 drivers work, but giving us far more control on how to do the process.
//
// In RenderingDevice, the idea is that anything you do (that is not _creating_ a resource) can be considered a _command_, such as:
//
// * Updating a buffer.
// * Copying a buffer
// * Clearing a buffer
// * Updating a texture
// * Copying a texture
// * Clearing a texture
// * Performing a render list (from draw_list_begin to draw_list_end; all the draw/bind calls in between are ONE single command).
// * Performing a compute list (all the dispatch calls are one command; calling compute_list_barrier() switches to a new command).
// * etc.
//
//
// The rule here is that, in a command, we always start from a valid state and end in a valid state. It is impossible to do something invalid (if the command arguments are invalid, this is errored before the command is submitted).
//
// Commands will take resources (buffers and textures), which they use for either reading or writing. As an example:
// * A texture used in a shader (descriptor set) as a sampler only reads.
// * A texture used in a shader (descriptor set) as an image may read or write. We need to check the SPIR-V reflection to see if it's read-only.
// * A buffer used as a UBO will only be read in a shader, but an SSBO may read or write depending on usage.
//
// As an example, here are all the resource usage types that may happen in commands for each resource:
//
// * Transfer from (texture or buffer copy source)
// * Transfer to (texture or buffer update or copy destination)
// * Transfer from/to (read/write from same)
// * Uniform buffer read (UBO)
// * Indirect buffer read (dispatch or draw indirect)
// * Storage buffer write
// * Storage buffer read
// * Storage buffer read/write
// * Vertex buffer read
// * Index buffer read
// * Texture sample
// * Storage image read
// * Storage image write
// * Storage image read/write
// * Texture render target write
// * Texture input attachment read
// * etc.
//
// Every command needs to track the resources used and what it does with them (the usage type) when building the render graph,
// which later allows creating barrier transitions accordingly.
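To make the read/write split concrete, here is a hypothetical classifier over a usage list like the one above. This is a standalone sketch; the enum values and the `is_write_usage` helper are invented for illustration, not the real RenderingDevice API:

```cpp
#include <cassert>

// Hypothetical usage types, condensed from the bullet list above.
enum UsageType {
    USAGE_TRANSFER_FROM,
    USAGE_TRANSFER_TO,
    USAGE_UNIFORM_BUFFER,
    USAGE_INDIRECT_BUFFER,
    USAGE_STORAGE_BUFFER_READ,
    USAGE_STORAGE_BUFFER_WRITE,
    USAGE_STORAGE_BUFFER_READ_WRITE,
    USAGE_VERTEX_BUFFER,
    USAGE_INDEX_BUFFER,
    USAGE_TEXTURE_SAMPLE,
    USAGE_STORAGE_IMAGE_READ,
    USAGE_STORAGE_IMAGE_WRITE,
    USAGE_STORAGE_IMAGE_READ_WRITE,
    USAGE_RENDER_TARGET_WRITE,
    USAGE_INPUT_ATTACHMENT_READ,
};

// A usage is a "write" if the command may modify the resource; the graph
// builder serializes writes and lets same-kind reads run in parallel.
bool is_write_usage(UsageType p_usage) {
    switch (p_usage) {
        case USAGE_TRANSFER_TO:
        case USAGE_STORAGE_BUFFER_WRITE:
        case USAGE_STORAGE_BUFFER_READ_WRITE:
        case USAGE_STORAGE_IMAGE_WRITE:
        case USAGE_STORAGE_IMAGE_READ_WRITE:
        case USAGE_RENDER_TARGET_WRITE:
            return true;
        default:
            return false; // All remaining usages are pure reads.
    }
}
```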
//
// So ultimately the graph creation happens with this pseudocode:
RenderingDevice::do_command(RID p_resource) {
	Command command = allocate_command();
	command.type = something;
	command.data = fill_with_something;
	command_add_resource_dependency(command, p_resource, usage_type); // usage_type is one from the bullet points above.
	command_add_to_list(command);
}
// The key here is the command_add_resource_dependency function, which tracks dependencies and usage types.
// command_add_resource_dependency needs to follow a certain logic, shown here in pseudocode:
RenderingDevice::command_add_resource_dependency(Command p_command, RID p_resource, UsageType p_usage_type) {
	if (resource_already_processed_for_command(p_resource, p_command)) {
		if (resource_get_previous_usage(p_resource) != p_usage_type) {
			return error; // Can't use the same resource for two different things in a command.
		}
		return ok;
	}
	bool write_usage = is_write_usage(p_usage_type); // Determine if this usage type is a write.
	if (!write_usage && is_read_usage(p_usage_type)) {
		// Pure read.
		UsageType previous_usage = resource_get_previous_usage(p_resource);
		if (!is_write_usage(previous_usage) && previous_usage != p_usage_type) {
			// If the previous usage was a read, but the type of read is different,
			// also consider it a write usage (due to layout change).
			write_usage = true;
		}
	}
	if (write_usage) {
		// Writing, insert a dependency.
		if (resource_read_commands_exist_in_frame(p_resource)) {
			// The resource has been read before, so we need to insert a dependency from those commands to this one.
			for (read_command in resource_read_commands_get(p_resource)) {
				command_add_dependency(read_command, p_command);
			}
		} else if (resource_write_command_exists_in_frame(p_resource)) {
			// The resource has been written before, so we need to insert a dependency from the previous write command to this one.
			command_add_dependency(resource_get_write_command(p_resource), p_command);
		}
		resource_set_write_command(p_resource, p_command); // Used with resource_write_command_exists_in_frame and resource_get_write_command.
		resource_clear_read_commands(p_resource); // This way, resource_read_commands_exist_in_frame will return false.
	} else {
		// Reading.
		if (resource_write_command_exists_in_frame(p_resource)) {
			// There is a command that wrote to this resource in this frame, insert a dependency to this one.
			command_add_dependency(resource_get_write_command(p_resource), p_command);
		}
		resource_add_read_command(p_resource, p_command); // Used with resource_read_commands_exist_in_frame and resource_read_commands_get.
	}
	resource_set_previous_usage(p_resource, p_usage_type); // Will be used with resource_get_previous_usage above.
	command_add_resource_barrier_usage(p_command, p_resource, p_usage_type); // Added to the list of barriers needed before executing this command. Also makes sure resource_already_processed_for_command returns true if called with the same resource and command.
	return ok;
}
// The general idea is that dependencies between commands, depending on resource usage, look more or less like this:
//
//                      [read commands]
//                     /               \
// [write command] -> [read commands] -> [write command]
//                     \               /
//                      [read commands]
//
// This allows the read commands to happen in parallel.
// The function above is implemented again in more detail below; this pseudocode is mainly intended to make clear how it works.
// Some implementation notes:
// command_add_dependency() must ensure that the dependency is only added once.
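The rule above can be modeled as a tiny standalone program. This is a sketch only (the `GraphCommand`/`Resource` types are invented for illustration, not Godot code): writes wait on all prior readers, or on the prior writer if there were none, while reads wait on the prior writer and accumulate in a reader list:

```cpp
#include <cassert>
#include <set>
#include <vector>

// Minimal standalone model of the dependency rule.
struct GraphCommand {
    std::set<int> deps; // Indices of commands this one depends on.
};

struct Resource {
    int last_write = -1;      // Index of the last writing command, -1 if none this frame.
    std::vector<int> readers; // Commands that read since the last write.
};

void add_resource_dependency(std::vector<GraphCommand> &cmds, int cmd, Resource &res, bool write) {
    if (write) {
        if (!res.readers.empty()) {
            // Wait for every command that read the resource since the last write.
            for (int r : res.readers) {
                cmds[cmd].deps.insert(r);
            }
        } else if (res.last_write != -1) {
            // No readers in between: wait for the previous writer directly.
            cmds[cmd].deps.insert(res.last_write);
        }
        res.last_write = cmd;
        res.readers.clear();
    } else {
        if (res.last_write != -1) {
            cmds[cmd].deps.insert(res.last_write); // Read-after-write dependency.
        }
        res.readers.push_back(cmd); // Parallel reads accumulate here.
    }
}
```

Running write → read → read → write through this model reproduces the diagram above: both reads depend only on the first write (so they can run in parallel), and the second write depends on both reads but not on the first write.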
/*************************/
/** CODE IMPLEMENTATION **/
/*************************/
// This is a more detailed code implementation of the algorithm above, including graph building with the resulting levels.
// Textures and Buffers have a pointer to these
struct ResourceTracker {
	enum Type {
		TYPE_TEXTURE,
		TYPE_BUFFER
	};
	enum Usage {
		USAGE_NONE,
		USAGE_TRANSFER_FROM,
		USAGE_TRANSFER_TO,
		USAGE_UNIFORM_BUFFER,
		USAGE_INDIRECT_BUFFER,
		USAGE_STORAGE_BUFFER_WRITE_BUFFER, // Storage buffer usage can be read from SPIR-V reflection.
		USAGE_STORAGE_BUFFER_READ_BUFFER,
		USAGE_STORAGE_BUFFER_READ_WRITE_BUFFER,
		USAGE_VERTEX_BUFFER,
		USAGE_INDEX_BUFFER,
		USAGE_SAMPLING_TEXTURE,
		USAGE_STORAGE_IMAGE_READ_TEXTURE, // Image usage can be read from SPIR-V reflection.
		USAGE_STORAGE_IMAGE_WRITE_TEXTURE,
		USAGE_STORAGE_IMAGE_READ_WRITE_TEXTURE,
		USAGE_RENDER_TARGET_TEXTURE,
		USAGE_INPUT_ATTACHMENT_TEXTURE,
		// Missing stuff, but you get the idea.
	};
	Type type;
	// Used to see if a command has processed this already and skip it.
	uint64_t current_command_frame = 0; // Frame in which a command last used this resource.
	uint64_t current_command_buffer_offset = 0; // Buffer offset of the command that last used this.
	int64_t current_write_command_offset; // Offset to the command that performed a write, or -1 if nothing wrote in this frame.
	int32_t read_command_list; // Index into a linked list (command_read_dependencies) of commands that performed reads.
	Usage current_usage = USAGE_NONE;
	union {
		// These also have a pointer back to the ResourceTracker.
		Texture *texture;
		Buffer *buffer;
	};
};
// ResourceTracker objects are allocated and packed in a pool (PoolAllocator) for cache efficiency.
//
// All command buffer commands are accumulated in a buffer while the render thread submits them.
struct ResourceUsage {
	ResourceTracker *base_resource;
	ResourceTracker::Usage intended_usage; // Used for the barrier.
};
LocalVector<ResourceUsage> used_resources; // LocalVector uses a worst-case allocator, so clearing and re-filling has no cost.
uint32_t used_resource_offset = 0;
struct CommandDependency {
	int32_t next; // -1 means no dependency.
	uint64_t dependency_offset;
};
LocalVector<CommandDependency> command_dependencies;
LocalVector<CommandDependency> command_read_dependencies; // used by ResourceTracker
struct Command {
	// Avoid using anything that needs to be constructed in here and in derived classes. Use pointers to resources, not RIDs.
	enum Type {
		TYPE_BUFFER_UPDATE,
		TYPE_BUFFER_COPY,
		TYPE_BUFFER_CLEAR,
		TYPE_TEXTURE_UPDATE,
		TYPE_TEXTURE_COPY,
		TYPE_TEXTURE_CLEAR,
		TYPE_RENDER_LIST,
		TYPE_COMPUTE_LIST, // Not just the compute list; every time compute_list_barrier() is called, a new one of these is created and used.
	};
	Type type;
	uint32_t size; // Based on the size of the actual inherited class.
	uint32_t index;
	uint32_t used_resource_from; // Index into used_resources.
	uint32_t used_resource_amount; // Amount of used resources.
	bool graph_processed; // False by default. Set to true after the graph has processed it, which avoids processing it twice.
	int32_t dependency_list; // -1 by default; points into the command_dependencies linked list.
	int32_t current_dependency; // -1 by default; used to avoid adding a dependency twice.
	Command *level_list; // ONLY this one can be a pointer, because at this stage no new commands will be added. nullptr by default.
};
struct CommandBufferUpdate : public Command {
	// Saves the start buffer index and offset in the staging buffers (this data is copied on the call into RenderingDevice).
	uint32_t staging_buffer_index;
	uint32_t staging_buffer_offset;
	RenderingDeviceDriver::BufferID target_buffer_index;
	uint64_t target_buffer_offset;
	uint64_t target_buffer_offset_size;
};
struct CommandTextureUpdate : public Command {
	// Similar to CommandBufferUpdate.
	CommandTextureUpdate() { type = TYPE_TEXTURE_UPDATE; }
};
LocalVector<uint8_t> command_buffer; // Used and expanded on demand. This is why we use offsets to commands, not pointers.
uint64_t command_buffer_offset;
// Commands are allocated into the LocalVector<uint8_t> using in-place allocation with sizeof() (not memnew_placement, for performance).
/**** How to use this? ****/
// The following pseudocode should give you an idea:
template <class T>
T *RenderingDevice::_allocate_command() {
	// Ensure there is enough space before writing.
	command_buffer.resize(command_buffer_offset + sizeof(T));
	T *c = (T *)&command_buffer[command_buffer_offset];
	// Fill the base Command fields.
	c->size = sizeof(T);
	c->index = 0;
	c->used_resource_from = used_resource_offset;
	c->used_resource_amount = 0;
	c->dependency_list = -1;
	c->current_dependency = -1;
	c->graph_processed = false;
	c->level_list = nullptr;
	return c;
}
void RenderingDevice::buffer_update(RID p_to_buffer, /* source data, etc. */) {
	CommandBufferUpdate *update = _allocate_command<CommandBufferUpdate>();
	// Fill the CommandBufferUpdate fields.
	update->type = Command::TYPE_BUFFER_UPDATE;
	// .. //
	// Actually copy the source data to the staging buffer.
	// Fill the ResourceUsage.
	ResourceTracker *buffer = obtain_tracker_from_buffer(p_to_buffer);
	_command_add_resource_dependency(update, buffer, ResourceTracker::USAGE_TRANSFER_TO); // See code below.
	command_buffer_offset += update->size;
}
// This is the most critical case, setting descriptor sets, since it's called a lot (well, mostly once per material).
void RenderingDevice::draw_list_bind_uniform_set(DrawListID p_draw_list, RID p_uniform_set) {
	CommandDrawList *draw_list = /* this was allocated on draw_list_begin already */;
	UniformSet *uniform_set = /* obtain from p_uniform_set */;
	for (uint32_t i = 0; i < uniform_set->dependency_tracker_count; i++) {
		// Add the dependency trackers.
		// Keep in mind that the following makes this efficient in the long term:
		// - Addition only happens ONCE per draw list (otherwise it's just marked as added and won't be added again).
		// - Ordering happens (or should happen) by material, so this descriptor set is only set once by the renderer.
		// - Multiple objects using the same material do not need to call this every time.
		// - Eventually, the idea is to move most of the static geometry to bindless (texture streaming), so this code will not be called at all.
		bool success = _command_add_resource_dependency(draw_list, uniform_set->dependency_trackers[i].tracker, uniform_set->dependency_trackers[i].usage);
#ifdef DEBUG_ENABLED
		ERR_FAIL_COND_MSG(!success, "Attempt to supply a uniform set resource with a different usage than the one specified in a previous uniform set, or in the draw list arguments.");
#endif
	}
}
// This should be inlined.
_FORCE_INLINE_ bool RenderingDevice::_command_add_resource_dependency(Command *p_command, ResourceTracker *p_resource, ResourceTracker::Usage p_usage) {
	if (p_resource->current_command_frame == current_frame && p_resource->current_command_buffer_offset == command_buffer_offset) {
#ifdef DEBUG_ENABLED
		if (p_usage != p_resource->current_usage) {
			return false; // Trying to use the same resource for two different things in the render pass.
		}
#endif
		return true; // Already added, do nothing.
	}
	if (p_resource->current_command_frame != current_frame) {
		// Resource not used in this frame, clean up.
		p_resource->current_command_frame = current_frame;
		// Clean up the write command and the read list.
		p_resource->current_write_command_offset = -1;
		p_resource->read_command_list = -1;
	}
	// Determine whether this is a write usage.
	bool write_usage = _is_write_usage(p_usage);
	if (!write_usage && _is_read_usage(p_usage)) {
		// Pure read, which means it can happen in parallel.
		if (!_is_write_usage(p_resource->current_usage) && p_resource->current_usage != p_usage) {
			// If the previous usage was a read, but the type of read is different,
			// also consider it a write usage (due to layout change).
			write_usage = true;
		}
	}
	if (write_usage) {
		// Writing, insert a dependency.
		if (p_resource->read_command_list != -1) {
			// If there are read commands, insert dependencies to those.
			int32_t read_command = p_resource->read_command_list;
			while (read_command != -1) {
				CommandDependency *cd = &command_read_dependencies[read_command];
				_command_add_command_dependency(cd->dependency_offset);
				read_command = cd->next;
			}
			// Clear the read command list, we are writing.
			p_resource->read_command_list = -1;
		} else if (p_resource->current_write_command_offset != -1) {
			_command_add_command_dependency(p_resource->current_write_command_offset);
		}
		p_resource->current_write_command_offset = command_buffer_offset;
	} else {
		// Reading.
		if (p_resource->current_write_command_offset != -1) {
			// If there is a write command, insert a dependency to the write command.
			_command_add_command_dependency(p_resource->current_write_command_offset);
		}
		// Add to the read linked list.
		CommandDependency cd;
		cd.next = p_resource->read_command_list;
		cd.dependency_offset = command_buffer_offset;
		p_resource->read_command_list = command_read_dependencies.size();
		command_read_dependencies.push_back(cd);
	}
	// Finally, update the resource.
	p_resource->current_command_buffer_offset = command_buffer_offset;
	p_resource->current_usage = p_usage;
	// Add to the used resources list.
	ResourceUsage *res_usage = &used_resources[used_resource_offset++];
	res_usage->base_resource = p_resource;
	res_usage->intended_usage = p_usage;
	p_command->used_resource_amount++; // Using ++ for reference, but in this command it only ever sets it to 1.
	return true;
}
_FORCE_INLINE_ void RenderingDevice::_command_add_command_dependency(uint64_t p_dependency_offset) {
	Command *dependency = (Command *)&command_buffer[p_dependency_offset];
	if (dependency->current_dependency == command_buffer_offset) {
		return; // Nothing to be done, dependency already added.
	}
	dependency->current_dependency = command_buffer_offset;
	CommandDependency cd;
	cd.next = dependency->dependency_list;
	cd.dependency_offset = command_buffer_offset;
	dependency->dependency_list = command_dependencies.size();
	command_dependencies.push_back(cd);
}
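The O(1) deduplication above relies on an invariant worth making explicit: all of one command's dependencies are registered before the next command starts, so a single "last dependent" marker per command suffices. Here is a hypothetical standalone model of just that trick (invented `DepCommand` type, not Godot code):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Each command remembers the last dependent that linked to it. If the
// same dependent shows up again, the edge is skipped without any search.
struct DepCommand {
    int64_t current_dependency = -1; // Last dependent that added an edge to us.
    std::vector<int64_t> dependents; // The edges; a plain vector for this sketch.
};

void add_dependency_once(DepCommand &dependency, int64_t dependent_offset) {
    if (dependency.current_dependency == dependent_offset) {
        return; // Already linked to this dependent: O(1) dedup, no list walk.
    }
    dependency.current_dependency = dependent_offset;
    dependency.dependents.push_back(dependent_offset);
}
```

This only deduplicates repeats from the currently-built command, which is exactly the case that occurs when a draw list binds several uniform sets sharing the same resource.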
/**** Acyclic graph solver ****/
// The following pseudocode solves the graph.
LocalVector<Command *> ordered_commands;
LocalVector<uint32_t> command_levels;
LocalVector<Command *> command_level_list;
//
void RenderingDevice::_process_graph() {
	ordered_commands.clear();
	uint64_t command_offset = 0;
	// Traverse the graph.
	while (command_offset < command_buffer_offset) {
		Command *command = (Command *)&command_buffer[command_offset];
		_process_graph_command(command);
		command_offset += command->size;
	}
	// Must be inverted after graph traversal.
	ordered_commands.invert();
	// Assign indices and base levels to each command.
	command_levels.resize(ordered_commands.size());
	for (uint32_t i = 0; i < ordered_commands.size(); i++) {
		ordered_commands[i]->index = i;
		command_levels[i] = 0;
	}
	// Assign levels to each command.
	uint32_t max_level = 0;
	for (uint32_t i = 0; i < ordered_commands.size(); i++) {
		Command *command = ordered_commands[i];
		int32_t dependency = command->dependency_list;
		uint32_t target_level = command_levels[i] + 1;
		while (dependency != -1) {
			CommandDependency *cd = &command_dependencies[dependency];
			Command *c = (Command *)&command_buffer[cd->dependency_offset];
			uint32_t dep_index = c->index;
			if (command_levels[dep_index] < target_level) {
				command_levels[dep_index] = target_level;
				max_level = MAX(max_level, target_level);
			}
			dependency = cd->next;
		}
	}
	// Create the list of commands ordered by level.
	command_level_list.resize(max_level + 1);
	for (uint32_t i = 0; i < command_level_list.size(); i++) {
		command_level_list[i] = nullptr;
	}
	for (uint32_t i = 0; i < ordered_commands.size(); i++) {
		Command *command = ordered_commands[i];
		uint32_t level = command_levels[i];
		command->level_list = command_level_list[level];
		command_level_list[level] = command;
	}
	// There we go! Now create the actual RenderingDeviceDriver command buffers.
	// A single barrier command can be emitted with everything the level needs before
	// actually emitting the level commands.
}
void RenderingDevice::_process_graph_command(Command *p_command) {
	if (p_command->graph_processed) {
		return; // Already processed.
	}
	p_command->graph_processed = true;
	// Solve dependencies first.
	int32_t dependency = p_command->dependency_list;
	while (dependency != -1) {
		CommandDependency *cd = &command_dependencies[dependency];
		_process_graph_command((Command *)&command_buffer[cd->dependency_offset]);
		dependency = cd->next;
	}
	ordered_commands.push_back(p_command);
}
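The solver above (DFS post-order, inversion, level propagation towards dependencies) can be sketched as a standalone model. Types and names here are invented for illustration, and recursion over indices replaces the offset-based traversal of the real design. Dependencies end up at higher levels than their dependents, so execution runs from max_level down to 0, with one barrier per level:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct Cmd {
    std::vector<int> deps; // Commands this one depends on.
    bool processed = false;
    int level = 0;
};

// DFS post-order: a command is pushed after all of its dependencies.
static void dfs(std::vector<Cmd> &cmds, int i, std::vector<int> &ordered) {
    if (cmds[i].processed) {
        return; // Already visited; each command is ordered exactly once.
    }
    cmds[i].processed = true;
    for (int d : cmds[i].deps) {
        dfs(cmds, d, ordered);
    }
    ordered.push_back(i);
}

// Returns max_level. Commands sharing a level have no dependencies among
// them, so one barrier per level is enough before emitting its commands.
int solve_levels(std::vector<Cmd> &cmds) {
    std::vector<int> ordered;
    for (int i = 0; i < (int)cmds.size(); i++) {
        dfs(cmds, i, ordered);
    }
    std::reverse(ordered.begin(), ordered.end()); // Dependents now come first.
    int max_level = 0;
    for (int i : ordered) {
        int target = cmds[i].level + 1;
        for (int d : cmds[i].deps) {
            if (cmds[d].level < target) {
                cmds[d].level = target; // Push dependencies one level higher.
                max_level = std::max(max_level, target);
            }
        }
    }
    return max_level;
}
```

A chain of three commands yields three levels, while two independent readers of the same write land on the same level and can share one barrier.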
/**** RENDER / COMPUTE PASS PROCESSING ****/
// For render pass / compute pass processing, you likely need a global command buffer using LocalVector:
LocalVector<uint8_t> render_pass_command_buffer;
// These are defined in **RenderingDeviceDriver**.
struct RenderPassCommand {
	enum Type {
		COMMAND_SET_BLEND_CONSTANTS,
		COMMAND_SET_BIND_RENDER_PIPELINE,
		COMMAND_SET_BIND_UNIFORM_SET,
		COMMAND_SET_BIND_VERTEX_ARRAY,
		COMMAND_SET_BIND_INDEX_ARRAY,
		COMMAND_SET_LINE_WIDTH,
		// etc.
	};
	Type type;
	// Information.
};
// example:
struct RenderPassCommandSetBlendConstants : public RenderPassCommand {
	Color color;
	RenderPassCommandSetBlendConstants() {
		type = COMMAND_SET_BLEND_CONSTANTS;
	}
};
// Internally, in RenderingDevice there is something like this, using the API mentioned before
struct CommandRenderPass : public Command {
	// Render pass variables, like viewport size, clear constants, etc.
	// ..
	uint64_t render_pass_command_buffer_begin_offset;
	uint64_t render_pass_command_buffer_end_offset;
};
// Then, the API in RenderingDeviceDriver can remove all the virtual functions for the render pass and instead take something like:
RenderingDeviceDriver::command_render_pass(<render_pass_args>, const uint8_t *p_render_command_buffer, uint64_t p_render_command_buffer_size);
// On the Godot side, in RenderingDevice, these functions (currently virtual) should all become inline instead:
virtual void draw_list_set_blend_constants(DrawListID p_list, const Color &p_color) = 0;
virtual void draw_list_bind_render_pipeline(DrawListID p_list, RID p_render_pipeline) = 0;
virtual void draw_list_bind_uniform_set(DrawListID p_list, RID p_uniform_set, uint32_t p_index) = 0;
virtual void draw_list_bind_vertex_array(DrawListID p_list, RID p_vertex_array) = 0;
virtual void draw_list_bind_index_array(DrawListID p_list, RID p_index_array) = 0;
virtual void draw_list_set_line_width(DrawListID p_list, float p_width) = 0;
virtual void draw_list_set_push_constant(DrawListID p_list, const void *p_data, uint32_t p_data_size) = 0;
// So the cost of filling the command buffer is minimal and can be done in a single function.
/**** Resource dependencies for draw_list_bind_uniform_set() ****/
// When creating a uniform set, we must have a readily accessible list of the dependencies it contains, because
// when calling draw_list_bind_uniform_set, they need to be processed in the acyclic graph.
// Having just an allocated array of dependencies is likely not going to be good cache-wise, so we should use
// a dependency pool.
// The way I suggest doing this is using a PagedAllocator and each element can be something like this:
struct UniformSetDependencyList {
	uint64_t count;
	UniformSetDependencyList *next = nullptr;
	struct Tracker {
		ResourceTracker *resource;
		ResourceTracker::Usage usage;
	};
	static constexpr uint32_t DEPENDENCY_TRACKER_MAX = /* calculate so the entire size of UniformSetDependencyList is a power of two, like 512, 1024, etc. Something between 512 and 4096 should be good for cache pages. */;
	Tracker dependency_trackers[DEPENDENCY_TRACKER_MAX];
};
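One way to pick DEPENDENCY_TRACKER_MAX is at compile time from a target block size. This is a hypothetical sketch (BLOCK_SIZE, HEADER_SIZE, and the simplified Tracker with a plain int usage are all assumptions for illustration):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

struct ResourceTracker; // Opaque for this sketch.

// Simplified tracker entry; the real one stores ResourceTracker::Usage.
struct Tracker {
    ResourceTracker *resource;
    int usage;
};

// Pick the tracker count so the whole node fits exactly in one
// power-of-two block, which keeps pool pages cache friendly.
constexpr size_t BLOCK_SIZE = 1024;
constexpr size_t HEADER_SIZE = sizeof(uint64_t) + sizeof(void *); // count + next pointer
constexpr size_t DEPENDENCY_TRACKER_MAX = (BLOCK_SIZE - HEADER_SIZE) / sizeof(Tracker);

struct UniformSetDependencyList {
    uint64_t count;
    UniformSetDependencyList *next = nullptr;
    Tracker dependency_trackers[DEPENDENCY_TRACKER_MAX];
};

static_assert(sizeof(UniformSetDependencyList) <= BLOCK_SIZE, "node must fit in one block");
```

On a typical 64-bit target this yields 63 trackers per 1024-byte node; uniform sets with more dependencies chain additional nodes through `next`.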
@K0bin commented Oct 5, 2023

No longer specify subpasses, this can now be inferred automatically by the subpass solver

How does this interact with pipeline compilation? Subpasses are part of the render pass compatibility rules.

Do everything lazily and track the used pipelines to do AOT compilation in production builds?

@reduz (Author) commented Oct 5, 2023

@K0bin afaik there is no need to. Subpass compatibility is what matters.

@K0bin commented Oct 6, 2023

Vulkan spec for vkCreateGraphicsPipeline (emphasis mine):

renderPass is a handle to a render pass object describing the environment in which the pipeline will be used. The pipeline must only be used with a render pass instance compatible with the one provided. See Render Pass Compatibility for more information.

Vulkan spec for Render pass compatibility:

Framebuffers and graphics pipelines are created based on a specific render pass object. They must only be used with that render pass object, or one compatible with it.

So I'm pretty sure the entire render pass has to be compatible, even if that makes things very annoying.
