Created
June 10, 2024 14:53
-
-
Save dfellis/79e3789e3fc3ea80580f334ec652404d to your computer and use it in GitHub Desktop.
wgpu buffer freeing bug reproduction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cargo manifest for the wgpu buffer-freeing bug reproduction.
[package]
name = "alan_generated_bin"
edition = "2021"

[dependencies]
flume = "0.11.0"   # bounded channel used to receive the buffer map_async result
futures = "0.3.30" # block_on executor for wgpu's async adapter/device requests
# "trace" enables wgpu API tracing (the trace path is passed to request_device)
wgpu = { version = "0.20.0", features = ["trace"] }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// `i64toi32` casts an i64 to an i32, truncating with Rust's `as` semantics
/// (values outside the i32 range wrap).
#[inline(always)]
fn i64toi32(i: &i64) -> i32 {
    let wide = *i;
    wide as i32
}
/// `filled` returns a Vec<V> of length `l` where every element is a clone of `i`.
#[inline(always)]
fn filled<V: std::clone::Clone>(i: &V, l: &i64) -> Vec<V> {
    std::iter::repeat(i.clone()).take(*l as usize).collect()
}
/// `print_vec` pretty-prints a vector as `[a, b, c]`, assuming the element
/// type implements `Display`.
#[inline(always)]
fn print_vec<A: std::fmt::Display>(vs: &Vec<A>) {
    let rendered: Vec<String> = vs.iter().map(|v| v.to_string()).collect();
    println!("[{}]", rendered.join(", "));
}
/// Bundles the wgpu handles needed to create buffers and submit GPU work.
struct GPU {
    pub instance: wgpu::Instance, // wgpu entry point the adapter was enumerated from
    pub adapter: wgpu::Adapter,   // the physical/fallback adapter selected in GPU::new
    pub device: wgpu::Device,     // logical device used to create buffers/pipelines
    pub queue: wgpu::Queue,       // command queue for submitting encoded work
}
impl GPU { | |
pub fn new() -> Result<GPU, Box<dyn std::error::Error>> { | |
let instance = wgpu::Instance::default(); | |
let mut adapters = instance.enumerate_adapters(wgpu::Backends::all()); | |
// We're going to grab the first WebGPU-compliant adapter | |
adapters.reverse(); | |
// TODO: This crashes if there are literally zero options | |
let mut adapter = adapters.pop().unwrap(); | |
while adapters.len() > 0 { | |
if adapter.get_downlevel_capabilities().is_webgpu_compliant() { | |
break; | |
} | |
adapter = match adapters.pop() { | |
Some(a) => a, | |
None => { | |
// None of the adapters have the required WebGPU capabilities | |
// So let's try to request a fallback (software-supported) one | |
// to at least not crash | |
let fallback_options = wgpu::RequestAdapterOptions { | |
power_preference: wgpu::PowerPreference::None, | |
force_fallback_adapter: true, | |
compatible_surface: None, | |
}; | |
let adapter_future = instance.request_adapter(&fallback_options); | |
match futures::executor::block_on(adapter_future) { | |
Some(a) => a, | |
None => panic!("Unable to acquire an adapter"), | |
} | |
} | |
}; | |
} | |
// Temporarily override for software no matter what | |
// Let's ask the adapter for everything it can do | |
let features = adapter.features(); | |
let limits = adapter.limits(); | |
let device_future = adapter.request_device( | |
&wgpu::DeviceDescriptor { | |
label: None, | |
required_features: features, | |
required_limits: limits, | |
}, | |
Some(&std::env::current_dir()?), | |
); | |
let (device, queue) = futures::executor::block_on(device_future)?; | |
Ok(GPU { | |
instance, | |
adapter, | |
device, | |
queue, | |
}) | |
} | |
} | |
#[inline(always)] | |
fn GPU_new() -> GPU { | |
// TODO: Make this safer | |
match GPU::new() { | |
Ok(g) => g, | |
Err(_) => unreachable!(), | |
} | |
} | |
fn create_buffer_init( | |
g: &mut GPU, | |
usage: &mut wgpu::BufferUsages, | |
vals: &mut Vec<i32>, | |
) -> wgpu::Buffer { | |
let val_slice = &vals[..]; | |
let val_ptr = val_slice.as_ptr(); | |
let val_u8_len = vals.len() * 4; | |
let val_u8: &[u8] = unsafe { std::slice::from_raw_parts(val_ptr as *const u8, val_u8_len) }; | |
wgpu::util::DeviceExt::create_buffer_init( | |
&g.device, | |
&wgpu::util::BufferInitDescriptor { | |
label: None, // TODO: Add a label for easier debugging? | |
contents: val_u8, | |
usage: *usage, | |
}, | |
) | |
} | |
fn create_empty_buffer( | |
g: &mut GPU, | |
usage: &mut wgpu::BufferUsages, | |
size: &mut i64, | |
) -> wgpu::Buffer { | |
g.device.create_buffer(&wgpu::BufferDescriptor { | |
label: None, // TODO: Add a label for easier debugging? | |
size: *size as u64, | |
usage: *usage, | |
mapped_at_creation: false, // TODO: With `create_buffer_init` does this make any sense? | |
}) | |
} | |
// TODO: Either add the ability to bind to const values, or come up with a better solution. For | |
// now, just hardwire a few buffer usage types in these functions | |
#[inline(always)] | |
fn map_read_buffer_type() -> wgpu::BufferUsages { | |
wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST | |
} | |
#[inline(always)] | |
fn storage_buffer_type() -> wgpu::BufferUsages { | |
wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC | |
} | |
type Vec_Vec_Buffer<'a> = Vec<Vec<&'a wgpu::Buffer>>; | |
#[inline(always)] | |
fn Vec_Vec_Buffer_new<'a>() -> Vec_Vec_Buffer<'a> { | |
Vec::new() | |
} | |
/// A single compute-dispatch plan: shader source plus the buffers to bind and
/// the grid dimensions to launch.
struct GPGPU<'a> {
    pub source: String,              // WGSL shader source text
    pub entrypoint: String,          // shader entry-point name ("main" via GPGPU::new)
    pub buffers: Vec_Vec_Buffer<'a>, // buffers[bind_group][binding]
    pub workgroup_sizes: [i64; 3],   // x/y/z counts passed to dispatch_workgroups
}
impl GPGPU<'_> { | |
fn new<'a>( | |
source: String, | |
buffers: Vec_Vec_Buffer<'a>, | |
workgroup_sizes: [i64; 3], | |
) -> GPGPU<'a> { | |
GPGPU { | |
source, | |
entrypoint: "main".to_string(), | |
buffers, | |
workgroup_sizes, | |
} | |
} | |
} | |
#[inline(always)] | |
fn GPGPU_new<'a>(source: &mut String, buffers: &'a mut Vec_Vec_Buffer) -> GPGPU<'a> { | |
GPGPU::new(source.clone(), buffers.clone(), [1, 1, 1]) // TODO: Expose this | |
} | |
fn GPGPU_new_easy<'a>(source: &mut String, buffer: &'a mut wgpu::Buffer) -> GPGPU<'a> { | |
// In order to support larger arrays, we need to split the buffer length across them. Each of | |
// indices is allowed to be up to 65535 (yes, a 16-bit integer) leading to a maximum length of | |
// 65535^3, or about 2.815x10^14 elements (about 281 trillion elements). Not quite up to the | |
// 64-bit address space limit 2^64 or about 1.845x10^19 or about 18 quintillion elements, but | |
// enough for exactly 1PB of 32-bit numbers in an array, so we should be good. | |
// For now, the 65535 limit should be hardcoded by the shader author and an early exit | |
// conditional check if the shader is operating on a nonexistent array index. This may change | |
// in the future if the performance penalty of the bounds check is considered too high. | |
// | |
// Explaining the equation itself, the array length, L, needs to be split into X, Y, and Z | |
// parts where L = X + A*Y + B*Z, with X, Y, and Z bound between 0 and 65534 (inclusive) while | |
// A is 65535 and B is 65535^2 or 4294836225. Computing each dimension is to take the original | |
// length of the array (which is the buffer size divided by 4 because we're only supporting | |
// 32-bit numbers for now) and then getting the division and remainder first by the B constant, | |
// and the Z limit becomes the division + 1, while the remainder is executed division and | |
// remainder on the A constant, division + 1, and this remainder becomes the X limit (plus 1). | |
// Including this big explanation in case I've made an off-by-one error here ;) | |
let l: i64 = (buffer.size() / 4).try_into().unwrap(); | |
let z_div = l / 4294836225; | |
let z = z_div + 1; | |
let z_rem = l.wrapping_rem(4294836225); | |
let y_div = z_rem / 65535; | |
let y = y_div + 1; | |
let y_rem = z_rem.wrapping_rem(65535); | |
let x = y_rem + 1; | |
GPGPU::new(source.clone(), vec![vec![buffer]], [x, y, z]) | |
} | |
/// `gpu_run` compiles the plan's WGSL source into a compute pipeline, binds the
/// plan's buffers, and submits a single dispatch sized by `workgroup_sizes`.
fn gpu_run(g: &mut GPU, gg: &mut GPGPU) {
    // Compile the WGSL shader source into a module.
    let module = g.device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: None,
        source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(&gg.source)),
    });
    // `layout: None` lets wgpu derive the pipeline layout from the shader,
    // which is what makes `get_bind_group_layout(i)` below meaningful.
    let compute_pipeline = g
        .device
        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: None,
            layout: None,
            module: &module,
            entry_point: &gg.entrypoint,
            compilation_options: wgpu::PipelineCompilationOptions::default(),
        });
    let mut bind_groups = Vec::new();
    let mut encoder = g
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    {
        // Inner scope: `cpass` must be dropped before `encoder.finish()`.
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: None,
            timestamp_writes: None,
        });
        cpass.set_pipeline(&compute_pipeline);
        // Outer index i = bind group, inner index j = binding within the group.
        for i in 0..gg.buffers.len() {
            let bind_group_layout = compute_pipeline.get_bind_group_layout(i.try_into().unwrap());
            let bind_group_buffers = &gg.buffers[i];
            let mut bind_group_entries = Vec::new();
            for j in 0..bind_group_buffers.len() {
                bind_group_entries.push(wgpu::BindGroupEntry {
                    binding: j.try_into().unwrap(),
                    resource: bind_group_buffers[j].as_entire_binding(),
                });
            }
            let bind_group = g.device.create_bind_group(&wgpu::BindGroupDescriptor {
                label: None,
                layout: &bind_group_layout,
                entries: &bind_group_entries[..],
            });
            bind_groups.push(bind_group);
        }
        // Groups are created in one loop and bound in a second so that
        // `bind_groups` is fully populated before `cpass` borrows into it.
        for i in 0..gg.buffers.len() {
            // The Rust borrow checker is forcing my hand here
            cpass.set_bind_group(i.try_into().unwrap(), &bind_groups[i], &[]);
        }
        cpass.dispatch_workgroups(
            gg.workgroup_sizes[0].try_into().unwrap(),
            gg.workgroup_sizes[1].try_into().unwrap(),
            gg.workgroup_sizes[2].try_into().unwrap(),
        );
    }
    g.queue.submit(Some(encoder.finish()));
}
fn read_buffer(g: &mut GPU, b: &mut wgpu::Buffer) -> Vec<i32> { | |
// TODO: Support other value types | |
let temp_buffer = create_empty_buffer( | |
g, | |
&mut map_read_buffer_type(), | |
&mut b.size().try_into().unwrap(), | |
); | |
let mut encoder = g | |
.device | |
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); | |
encoder.copy_buffer_to_buffer(b, 0, &temp_buffer, 0, b.size()); | |
g.queue.submit(Some(encoder.finish())); | |
let temp_slice = temp_buffer.slice(..); | |
let (sender, receiver) = flume::bounded(1); | |
temp_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); | |
g.device.poll(wgpu::Maintain::wait()).panic_on_timeout(); | |
if let Ok(Ok(())) = receiver.recv() { | |
let data = temp_slice.get_mapped_range(); | |
let data_ptr = data.as_ptr(); | |
let data_len = data.len() / 4; // From u8 to i32 | |
let data_i32: &[i32] = | |
unsafe { std::slice::from_raw_parts(data_ptr as *const i32, data_len) }; | |
let result = data_i32.to_vec(); | |
drop(data); | |
temp_buffer.unmap(); | |
result | |
} else { | |
panic!("failed to run compute on gpu!") | |
} | |
} | |
fn main() { | |
let mut g = GPU_new(); | |
let mut b = create_buffer_init( | |
&mut g, | |
&mut storage_buffer_type(), | |
&mut filled(&mut i64toi32(&mut 2), &mut 4), | |
); | |
let mut plan = GPGPU_new_easy( | |
&mut " | |
@group(0) | |
@binding(0) | |
var<storage, read_write> vals: array<i32>; | |
@compute | |
@workgroup_size(1) | |
fn main(@builtin(global_invocation_id) id: vec3<u32>) { | |
vals[id.x] = vals[id.x] * bitcast<i32>(id.x); | |
} | |
" | |
.to_string(), | |
&mut b, | |
); | |
gpu_run(&mut g, &mut plan); | |
print_vec(&mut read_buffer(&mut g, &mut b)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment