@reduz
Created December 4, 2023 12:35
Rendering Device Threads

The texture_create functions should be valid to call from any thread. The idea is that the setup command buffer, when dedicated transfer queues exist, is only used for creating and initializing resources. As such, this creation / initialization process should use a dedicated queue in order to:

  • Avoid unnecessary locking across the whole API
  • Improve performance (no need to slice the texture up into chunks)
  • Avoid saturating the staging buffer, which generally remains small, with initialization data, forcing it to grow to the max size and then flush.

Anyway, these should be thread safe and use the dedicated transfer queue:

	virtual RID texture_create(const TextureFormat &p_format, const TextureView &p_view, const Vector<Vector<uint8_t>> &p_data = Vector<Vector<uint8_t>>()) = 0;
	virtual RID texture_create_shared(const TextureView &p_view, RID p_with_texture) = 0;
	virtual RID texture_create_from_extension(TextureType p_type, DataFormat p_format, TextureSamples p_samples, BitField<RenderingDevice::TextureUsageBits> p_flags, uint64_t p_image, uint64_t p_width, uint64_t p_height, uint64_t p_depth, uint64_t p_layers) = 0;

	virtual RID texture_create_shared_from_slice(const TextureView &p_view, RID p_with_texture, uint32_t p_layer, uint32_t p_mipmap, uint32_t p_mipmaps = 1, TextureSliceType p_slice_type = TEXTURE_SLICE_2D, uint32_t p_layers = 0) = 0;
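
A rough sketch of how the creation path could route initialization data through the dedicated transfer queue. All helper names and members below are placeholders, not the actual implementation:

	// Placeholder sketch only; helpers and members are illustrative.
	RID RenderingDeviceVulkan::texture_create(const TextureFormat &p_format, const TextureView &p_view, const Vector<Vector<uint8_t>> &p_data) {
		// Creation bookkeeping takes its own lock, so draw/compute code is never blocked.
		MutexLock lock(transfer_queue_mutex);

		Texture texture = _texture_allocate(p_format, p_view); // hypothetical helper

		if (!p_data.is_empty()) {
			// Upload everything in one go through a transient staging buffer submitted
			// on the transfer queue, instead of slicing it through the frame staging buffer.
			StagingBuffer staging = _staging_allocate_transient(p_data); // hypothetical helper
			_transfer_queue_copy_to_texture(staging, texture); // hypothetical helper
		}

		return texture_owner.make_rid(texture);
	}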

The rest of the texture functions should NOT work on threads; they should have a thread guard and fail with an ERR_FAIL* macro if called from anything other than the render thread.

	virtual Error texture_update(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Vector<uint8_t> texture_get_data(RID p_texture, uint32_t p_layer) = 0; // CPU textures will return immediately, while GPU textures will most likely force a flush

	virtual bool texture_is_format_supported_for_usage(DataFormat p_format, BitField<RenderingDevice::TextureUsageBits> p_usage) const = 0;
	virtual bool texture_is_shared(RID p_texture) = 0;
	virtual bool texture_is_valid(RID p_texture) = 0;
	virtual TextureFormat texture_get_format(RID p_texture) = 0;
	virtual Size2i texture_size(RID p_texture) = 0;
	virtual uint64_t texture_get_native_handle(RID p_texture) = 0;

	virtual Error texture_copy(RID p_from_texture, RID p_to_texture, const Vector3 &p_from, const Vector3 &p_to, const Vector3 &p_size, uint32_t p_src_mipmap, uint32_t p_dst_mipmap, uint32_t p_src_layer, uint32_t p_dst_layer, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Error texture_clear(RID p_texture, const Color &p_color, uint32_t p_base_mipmap, uint32_t p_mipmaps, uint32_t p_base_layer, uint32_t p_layers, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Error texture_resolve_multisample(RID p_from_texture, RID p_to_texture, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
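
A minimal sketch of what such a guard could look like, assuming the device stores the ID of its render thread in a member (render_thread_id is illustrative, not an existing field):

	// Illustrative sketch, not the actual implementation.
	Error RenderingDeviceVulkan::texture_update(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data, BitField<BarrierMask> p_post_barrier) {
		// Guard: only the render thread may record texture updates.
		ERR_FAIL_COND_V_MSG(Thread::get_caller_id() != render_thread_id, ERR_INVALID_PARAMETER,
				"texture_update() can only be called from the render thread.");

		// ... existing update logic ...
		return OK;
	}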

Creating framebuffers should be safe in threads.

	// This ID is guaranteed to be unique for the same formats, does not need to be freed
	virtual FramebufferFormatID framebuffer_format_create(const Vector<AttachmentFormat> &p_format, uint32_t p_view_count = 1) = 0;
	struct FramebufferPass {
		enum {
			ATTACHMENT_UNUSED = -1
		};
		Vector<int32_t> color_attachments;
		Vector<int32_t> input_attachments;
		Vector<int32_t> resolve_attachments;
		Vector<int32_t> preserve_attachments;
		int32_t depth_attachment = ATTACHMENT_UNUSED;
		int32_t vrs_attachment = ATTACHMENT_UNUSED; // density map for VRS, only used if supported
	};

	virtual FramebufferFormatID framebuffer_format_create_multipass(const Vector<AttachmentFormat> &p_attachments, const Vector<FramebufferPass> &p_passes, uint32_t p_view_count = 1) = 0;
	virtual FramebufferFormatID framebuffer_format_create_empty(TextureSamples p_samples = TEXTURE_SAMPLES_1) = 0;
	virtual TextureSamples framebuffer_format_get_texture_samples(FramebufferFormatID p_format, uint32_t p_pass = 0) = 0;

	virtual RID framebuffer_create(const Vector<RID> &p_texture_attachments, FramebufferFormatID p_format_check = INVALID_ID, uint32_t p_view_count = 1) = 0;
	virtual RID framebuffer_create_multipass(const Vector<RID> &p_texture_attachments, const Vector<FramebufferPass> &p_passes, FramebufferFormatID p_format_check = INVALID_ID, uint32_t p_view_count = 1) = 0;
	virtual RID framebuffer_create_empty(const Size2i &p_size, TextureSamples p_samples = TEXTURE_SAMPLES_1, FramebufferFormatID p_format_check = INVALID_ID) = 0;
	virtual bool framebuffer_is_valid(RID p_framebuffer) const = 0;
	virtual void framebuffer_set_invalidation_callback(RID p_framebuffer, InvalidationCallback p_callback, void *p_userdata) = 0;

	virtual FramebufferFormatID framebuffer_get_format(RID p_framebuffer) = 0;
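
Since equal formats must map to the same ID and the IDs are never freed, a small mutex around the lookup is enough to make format creation thread safe. A rough sketch, where the key type and cache members are hypothetical:

	// Hypothetical sketch of a thread-safe format ID cache.
	RD::FramebufferFormatID RenderingDeviceVulkan::framebuffer_format_create(const Vector<AttachmentFormat> &p_format, uint32_t p_view_count) {
		FramebufferFormatKey key(p_format, p_view_count); // hypothetical hashable key

		MutexLock lock(framebuffer_format_mutex);
		if (const FramebufferFormatID *existing = framebuffer_format_cache.getptr(key)) {
			return *existing; // Same attachments and view count always yield the same ID.
		}

		FramebufferFormatID id = framebuffer_format_cache.size() + 1; // IDs are never freed, so a counter is enough
		framebuffer_format_cache.insert(key, id);
		return id;
	}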

Creating samplers should be safe in threads (this is, afaik, already safe in Vulkan).

	virtual RID sampler_create(const SamplerState &p_state) = 0;
	virtual bool sampler_is_format_supported_for_filter(DataFormat p_format, SamplerFilter p_sampler_filter) const = 0;

Vertex buffer / format / array creation (with data initialization) should work on threads.

	virtual RID vertex_buffer_create(uint32_t p_size_bytes, const Vector<uint8_t> &p_data = Vector<uint8_t>(), bool p_use_as_storage = false) = 0;

	typedef int64_t VertexFormatID;

	// This ID is guaranteed to be unique for the same formats, does not need to be freed
	virtual VertexFormatID vertex_format_create(const Vector<VertexAttribute> &p_vertex_formats) = 0;
	virtual RID vertex_array_create(uint32_t p_vertex_count, VertexFormatID p_vertex_format, const Vector<RID> &p_src_buffers, const Vector<uint64_t> &p_offsets = Vector<uint64_t>()) = 0;

	enum IndexBufferFormat {
		INDEX_BUFFER_FORMAT_UINT16,
		INDEX_BUFFER_FORMAT_UINT32,
	};

	virtual RID index_buffer_create(uint32_t p_size_indices, IndexBufferFormat p_format, const Vector<uint8_t> &p_data = Vector<uint8_t>(), bool p_use_restart_indices = false) = 0;
	virtual RID index_array_create(RID p_index_buffer, uint32_t p_index_offset, uint32_t p_index_count) = 0;
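
As a usage sketch, this is the kind of call sequence a loader thread should be able to run end to end. The attribute layout is illustrative (positions only):

	// Usage sketch: building a positions-only mesh entirely from a loader thread.
	RID build_mesh_from_loader_thread(RenderingDevice *p_rd, const Vector<uint8_t> &p_vertex_data, uint32_t p_vertex_count) {
		RD::VertexAttribute attrib;
		attrib.location = 0;
		attrib.offset = 0;
		attrib.format = RD::DATA_FORMAT_R32G32B32_SFLOAT; // vec3 position
		attrib.stride = sizeof(float) * 3;

		Vector<RD::VertexAttribute> attributes;
		attributes.push_back(attrib);
		RD::VertexFormatID format = p_rd->vertex_format_create(attributes); // same input always returns the same ID

		Vector<RID> buffers;
		buffers.push_back(p_rd->vertex_buffer_create(p_vertex_data.size(), p_vertex_data));
		return p_rd->vertex_array_create(p_vertex_count, format, buffers);
	}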

Shader compilation already works on threads, and shader creation should work on threads too.

	virtual bool has_feature(const Features p_feature) const = 0;

	virtual Vector<uint8_t> shader_compile_spirv_from_source(ShaderStage p_stage, const String &p_source_code, ShaderLanguage p_language = SHADER_LANGUAGE_GLSL, String *r_error = nullptr, bool p_allow_cache = true);
	virtual String shader_get_spirv_cache_key() const;

	static void shader_set_compile_to_spirv_function(ShaderCompileToSPIRVFunction p_function);
	static void shader_set_spirv_cache_function(ShaderCacheFunction p_function);
	static void shader_set_get_cache_key_function(ShaderSPIRVGetCacheKeyFunction p_function);

	struct ShaderStageSPIRVData {
		ShaderStage shader_stage;
		Vector<uint8_t> spir_v;

		ShaderStageSPIRVData() {
			shader_stage = SHADER_STAGE_VERTEX;
		}
	};

	virtual String shader_get_binary_cache_key() const = 0;
	virtual Vector<uint8_t> shader_compile_binary_from_spirv(const Vector<ShaderStageSPIRVData> &p_spirv, const String &p_shader_name = "") = 0;

	virtual RID shader_create_from_spirv(const Vector<ShaderStageSPIRVData> &p_spirv, const String &p_shader_name = "");
	virtual RID shader_create_from_bytecode(const Vector<uint8_t> &p_shader_binary, RID p_placeholder = RID()) = 0;
	virtual RID shader_create_placeholder() = 0;

	virtual uint64_t shader_get_vertex_input_attribute_mask(RID p_shader) = 0;
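
For example, a worker thread should be able to go from GLSL source all the way to a shader RID without touching the render thread (error handling omitted for brevity):

	// Usage sketch: compiling and creating a shader entirely off the render thread.
	RID compile_shader_off_thread(RenderingDevice *p_rd, const String &p_vertex_glsl, const String &p_fragment_glsl) {
		Vector<RD::ShaderStageSPIRVData> stages;

		RD::ShaderStageSPIRVData vert;
		vert.shader_stage = RD::SHADER_STAGE_VERTEX;
		vert.spir_v = p_rd->shader_compile_spirv_from_source(RD::SHADER_STAGE_VERTEX, p_vertex_glsl);
		stages.push_back(vert);

		RD::ShaderStageSPIRVData frag;
		frag.shader_stage = RD::SHADER_STAGE_FRAGMENT;
		frag.spir_v = p_rd->shader_compile_spirv_from_source(RD::SHADER_STAGE_FRAGMENT, p_fragment_glsl);
		stages.push_back(frag);

		return p_rd->shader_create_from_spirv(stages, "threaded_shader");
	}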

Buffer creation with initialization should work on threads.

	virtual RID uniform_buffer_create(uint32_t p_size_bytes, const Vector<uint8_t> &p_data = Vector<uint8_t>()) = 0;
	virtual RID storage_buffer_create(uint32_t p_size, const Vector<uint8_t> &p_data = Vector<uint8_t>(), BitField<StorageBufferUsage> p_usage = 0) = 0;
	virtual RID texture_buffer_create(uint32_t p_size_elements, DataFormat p_format, const Vector<uint8_t> &p_data = Vector<uint8_t>()) = 0;

Uniform set creation should work on threads.

	virtual RID uniform_set_create(const Vector<Uniform> &p_uniforms, RID p_shader, uint32_t p_shader_set) = 0;
	virtual bool uniform_set_is_valid(RID p_uniform_set) = 0;
	virtual void uniform_set_set_invalidation_callback(RID p_uniform_set, InvalidationCallback p_callback, void *p_userdata) = 0;
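
For example, a loader or worker thread should be able to assemble a uniform set for a freshly created texture. Binding and set numbers below are illustrative:

	// Usage sketch: creating a texture+sampler uniform set off the render thread.
	RID make_texture_uniform_set(RenderingDevice *p_rd, RID p_shader, RID p_sampler, RID p_texture) {
		RD::Uniform u;
		u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
		u.binding = 0;
		u.append_id(p_sampler);
		u.append_id(p_texture);

		Vector<RD::Uniform> uniforms;
		uniforms.push_back(u);
		return p_rd->uniform_set_create(uniforms, p_shader, 0); // set 0 is illustrative
	}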

Buffer modification and retrieval should NOT work on threads. These need a thread guard that errors if called outside the render thread.

	virtual Error buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t p_src_offset, uint32_t p_dst_offset, uint32_t p_size, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Error buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Error buffer_clear(RID p_buffer, uint32_t p_offset, uint32_t p_size, BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual Vector<uint8_t> buffer_get_data(RID p_buffer, uint32_t p_offset = 0, uint32_t p_size = 0) = 0; // This causes stall, only use to retrieve large buffers for saving.

Render and compute pipelines should allow creation on threads.

	virtual bool render_pipeline_is_valid(RID p_pipeline) = 0;
	virtual RID render_pipeline_create(RID p_shader, FramebufferFormatID p_framebuffer_format, VertexFormatID p_vertex_format, RenderPrimitive p_render_primitive, const PipelineRasterizationState &p_rasterization_state, const PipelineMultisampleState &p_multisample_state, const PipelineDepthStencilState &p_depth_stencil_state, const PipelineColorBlendState &p_blend_state, BitField<PipelineDynamicStateFlags> p_dynamic_state_flags = 0, uint32_t p_for_render_pass = 0, const Vector<PipelineSpecializationConstant> &p_specialization_constants = Vector<PipelineSpecializationConstant>()) = 0;

	virtual RID compute_pipeline_create(RID p_shader, const Vector<PipelineSpecializationConstant> &p_specialization_constants = Vector<PipelineSpecializationConstant>()) = 0;
	virtual bool compute_pipeline_is_valid(RID p_pipeline) = 0;

Draw and compute lists should have thread guards everywhere and return with an error if called from anything other than the render thread.

	virtual DrawListID draw_list_begin_for_screen(DisplayServer::WindowID p_screen = 0, const Color &p_clear_color = Color()) = 0;
	virtual DrawListID draw_list_begin(RID p_framebuffer, InitialAction p_initial_color_action, FinalAction p_final_color_action, InitialAction p_initial_depth_action, FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2(), const Vector<RID> &p_storage_textures = Vector<RID>()) = 0;
	virtual Error draw_list_begin_split(RID p_framebuffer, uint32_t p_splits, DrawListID *r_split_ids, InitialAction p_initial_color_action, FinalAction p_final_color_action, InitialAction p_initial_depth_action, FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2(), const Vector<RID> &p_storage_textures = Vector<RID>()) = 0;

	virtual void draw_list_set_blend_constants(DrawListID p_list, const Color &p_color) = 0;
	virtual void draw_list_bind_render_pipeline(DrawListID p_list, RID p_render_pipeline) = 0;
	virtual void draw_list_bind_uniform_set(DrawListID p_list, RID p_uniform_set, uint32_t p_index) = 0;
	virtual void draw_list_bind_vertex_array(DrawListID p_list, RID p_vertex_array) = 0;
	virtual void draw_list_bind_index_array(DrawListID p_list, RID p_index_array) = 0;
	virtual void draw_list_set_line_width(DrawListID p_list, float p_width) = 0;
	virtual void draw_list_set_push_constant(DrawListID p_list, const void *p_data, uint32_t p_data_size) = 0;

	virtual void draw_list_draw(DrawListID p_list, bool p_use_indices, uint32_t p_instances = 1, uint32_t p_procedural_vertices = 0) = 0;

	virtual void draw_list_enable_scissor(DrawListID p_list, const Rect2 &p_rect) = 0;
	virtual void draw_list_disable_scissor(DrawListID p_list) = 0;

	virtual uint32_t draw_list_get_current_pass() = 0;
	virtual DrawListID draw_list_switch_to_next_pass() = 0;
	virtual Error draw_list_switch_to_next_pass_split(uint32_t p_splits, DrawListID *r_split_ids) = 0;

	virtual void draw_list_end(BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;

	/***********************/
	/**** COMPUTE LISTS ****/
	/***********************/

	typedef int64_t ComputeListID;

	virtual ComputeListID compute_list_begin(bool p_allow_draw_overlap = false) = 0;
	virtual void compute_list_bind_compute_pipeline(ComputeListID p_list, RID p_compute_pipeline) = 0;
	virtual void compute_list_bind_uniform_set(ComputeListID p_list, RID p_uniform_set, uint32_t p_index) = 0;
	virtual void compute_list_set_push_constant(ComputeListID p_list, const void *p_data, uint32_t p_data_size) = 0;
	virtual void compute_list_dispatch(ComputeListID p_list, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) = 0;
	virtual void compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads) = 0;
	virtual void compute_list_dispatch_indirect(ComputeListID p_list, RID p_buffer, uint32_t p_offset) = 0;
	virtual void compute_list_add_barrier(ComputeListID p_list) = 0;

	virtual void compute_list_end(BitField<BarrierMask> p_post_barrier = BARRIER_MASK_ALL_BARRIERS) = 0;

	virtual void barrier(BitField<BarrierMask> p_from = BARRIER_MASK_ALL_BARRIERS, BitField<BarrierMask> p_to = BARRIER_MASK_ALL_BARRIERS) = 0;
	virtual void full_barrier() = 0;

Free should not work from threads, and should also be thread guarded.

	/***************/
	/**** FREE! ****/
	/***************/

	virtual void free(RID p_id) = 0;

The timestamp API should also be thread guarded and throw an error if used from threads.

	virtual void capture_timestamp(const String &p_name) = 0;
	virtual uint32_t get_captured_timestamps_count() const = 0;
	virtual uint64_t get_captured_timestamps_frame() const = 0;
	virtual uint64_t get_captured_timestamp_gpu_time(uint32_t p_index) const = 0;
	virtual uint64_t get_captured_timestamp_cpu_time(uint32_t p_index) const = 0;
	virtual String get_captured_timestamp_name(uint32_t p_index) const = 0;

The Limits API is fine to use from threads (since it is often needed when compiling pipelines and shaders).

	/****************/
	/**** LIMITS ****/
	/****************/

	enum Limit {
		LIMIT_MAX_BOUND_UNIFORM_SETS,
		LIMIT_MAX_FRAMEBUFFER_COLOR_ATTACHMENTS,
		LIMIT_MAX_TEXTURES_PER_UNIFORM_SET,
		LIMIT_MAX_SAMPLERS_PER_UNIFORM_SET,
		LIMIT_MAX_STORAGE_BUFFERS_PER_UNIFORM_SET,
		LIMIT_MAX_STORAGE_IMAGES_PER_UNIFORM_SET,
		LIMIT_MAX_UNIFORM_BUFFERS_PER_UNIFORM_SET,
		LIMIT_MAX_DRAW_INDEXED_INDEX,
		LIMIT_MAX_FRAMEBUFFER_HEIGHT,
		LIMIT_MAX_FRAMEBUFFER_WIDTH,
		LIMIT_MAX_TEXTURE_ARRAY_LAYERS,
		LIMIT_MAX_TEXTURE_SIZE_1D,
		LIMIT_MAX_TEXTURE_SIZE_2D,
		LIMIT_MAX_TEXTURE_SIZE_3D,
		LIMIT_MAX_TEXTURE_SIZE_CUBE,
		LIMIT_MAX_TEXTURES_PER_SHADER_STAGE,
		LIMIT_MAX_SAMPLERS_PER_SHADER_STAGE,
		LIMIT_MAX_STORAGE_BUFFERS_PER_SHADER_STAGE,
		LIMIT_MAX_STORAGE_IMAGES_PER_SHADER_STAGE,
		LIMIT_MAX_UNIFORM_BUFFERS_PER_SHADER_STAGE,
		LIMIT_MAX_PUSH_CONSTANT_SIZE,
		LIMIT_MAX_UNIFORM_BUFFER_SIZE,
		LIMIT_MAX_VERTEX_INPUT_ATTRIBUTE_OFFSET,
		LIMIT_MAX_VERTEX_INPUT_ATTRIBUTES,
		LIMIT_MAX_VERTEX_INPUT_BINDINGS,
		LIMIT_MAX_VERTEX_INPUT_BINDING_STRIDE,
		LIMIT_MIN_UNIFORM_BUFFER_OFFSET_ALIGNMENT,
		LIMIT_MAX_COMPUTE_SHARED_MEMORY_SIZE,
		LIMIT_MAX_COMPUTE_WORKGROUP_COUNT_X,
		LIMIT_MAX_COMPUTE_WORKGROUP_COUNT_Y,
		LIMIT_MAX_COMPUTE_WORKGROUP_COUNT_Z,
		LIMIT_MAX_COMPUTE_WORKGROUP_INVOCATIONS,
		LIMIT_MAX_COMPUTE_WORKGROUP_SIZE_X,
		LIMIT_MAX_COMPUTE_WORKGROUP_SIZE_Y,
		LIMIT_MAX_COMPUTE_WORKGROUP_SIZE_Z,
		LIMIT_MAX_VIEWPORT_DIMENSIONS_X,
		LIMIT_MAX_VIEWPORT_DIMENSIONS_Y,
		LIMIT_SUBGROUP_SIZE,
		LIMIT_SUBGROUP_MIN_SIZE,
		LIMIT_SUBGROUP_MAX_SIZE,
		LIMIT_SUBGROUP_IN_SHADERS, // Set flags using SHADER_STAGE_VERTEX_BIT, SHADER_STAGE_FRAGMENT_BIT, etc.
		LIMIT_SUBGROUP_OPERATIONS,
		LIMIT_VRS_TEXEL_WIDTH,
		LIMIT_VRS_TEXEL_HEIGHT,
	};

	virtual uint64_t limit_get(Limit p_limit) const = 0;

	//methods below not exposed, used by RenderingDeviceRD
	virtual void prepare_screen_for_drawing() = 0;

	virtual void swap_buffers() = 0;

	virtual uint32_t get_frame_delay() const = 0;

	virtual void submit() = 0;
	virtual void sync() = 0;

	enum MemoryType {
		MEMORY_TEXTURES,
		MEMORY_BUFFERS,
		MEMORY_TOTAL
	};

	virtual uint64_t get_memory_usage(MemoryType p_type) const = 0;
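
For example, a worker thread that is preparing a compute pipeline may want to pick a workgroup size based on the device limits. The specialization constant usage below is illustrative:

	// Usage sketch: querying limits from a worker thread while building a pipeline.
	RID create_compute_pipeline_off_thread(RenderingDevice *p_rd, RID p_shader) {
		uint64_t max_invocations = p_rd->limit_get(RD::LIMIT_MAX_COMPUTE_WORKGROUP_INVOCATIONS);

		RD::PipelineSpecializationConstant group_size;
		group_size.type = RD::PIPELINE_SPECIALIZATION_CONSTANT_TYPE_INT;
		group_size.constant_id = 0; // assumes the shader exposes constant_id = 0 as its group size
		group_size.int_value = max_invocations < 256 ? (uint32_t)max_invocations : 256u;

		Vector<RD::PipelineSpecializationConstant> constants;
		constants.push_back(group_size);

		// Pipeline creation itself is also expected to be thread safe.
		return p_rd->compute_pipeline_create(p_shader, constants);
	}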

Local devices should store, as their render thread, the thread that creates them, not the main render thread. This can be obtained with Thread::get_current(), I think.

	virtual RenderingDevice *create_local_device() = 0;
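
A rough sketch of what that could look like, reusing the hypothetical render_thread_id member from the guard sketch above (the exact Thread helper and initialization call are assumptions):

	// Illustrative sketch only.
	RenderingDevice *RenderingDeviceVulkan::create_local_device() {
		RenderingDeviceVulkan *rd = memnew(RenderingDeviceVulkan);
		rd->initialize(context, true); // local device, no screen
		// The creating thread becomes this device's "render thread" for all the guards above.
		rd->render_thread_id = Thread::get_caller_id();
		return rd;
	}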

The debug functions should all be thread guarded and only work from the render thread.

	virtual void set_resource_name(RID p_id, const String p_name) = 0;

	virtual void draw_command_begin_label(String p_label_name, const Color p_color = Color(1, 1, 1, 1)) = 0;
	virtual void draw_command_insert_label(String p_label_name, const Color p_color = Color(1, 1, 1, 1)) = 0;
	virtual void draw_command_end_label() = 0;

Info functions should work fine already from threads.

	virtual String get_device_vendor_name() const = 0;
	virtual String get_device_name() const = 0;
	virtual RenderingDevice::DeviceType get_device_type() const = 0;
	virtual String get_device_api_version() const = 0;
	virtual String get_device_pipeline_cache_uuid() const = 0;

Get driver native resource should only work from the main thread (since not everything it accesses is thread guarded).

	virtual uint64_t get_driver_resource(DriverResource p_resource, RID p_rid = RID(), uint64_t p_index = 0) = 0;