Skip to content

Instantly share code, notes, and snippets.

@dfellis
Created June 10, 2024 14:53
Show Gist options
  • Save dfellis/79e3789e3fc3ea80580f334ec652404d to your computer and use it in GitHub Desktop.
wgpu buffer freeing bug reproduction
[package]
name = "alan_generated_bin"
# `version` is required by Cargo for a standalone (non-workspace) package;
# without it this manifest fails to parse.
version = "0.0.1"
edition = "2021"

[dependencies]
flume = "0.11.0"
futures = "0.3.30"
wgpu = { version = "0.20.0", features = ["trace"] }
/// `i64toi32` casts an i64 to an i32.
///
/// Uses the `as` cast, so values outside the i32 range wrap
/// (two's-complement truncation) rather than erroring.
#[inline(always)]
fn i64toi32(i: &i64) -> i32 {
    let wide: i64 = *i;
    wide as i32
}
/// `filled` returns a `Vec<V>` of length `l` where every element is a
/// clone of the provided value.
#[inline(always)]
fn filled<V: std::clone::Clone>(i: &V, l: &i64) -> Vec<V> {
    let count = *l as usize;
    std::iter::repeat(i.clone()).take(count).collect()
}
/// `print_vec` pretty-prints a vector as `[a, b, c]`, assuming the element
/// type implements `Display`.
#[inline(always)]
fn print_vec<A: std::fmt::Display>(vs: &Vec<A>) {
    let rendered: Vec<String> = vs.iter().map(|v| v.to_string()).collect();
    println!("[{}]", rendered.join(", "));
}
/// Owns the wgpu handles needed to run compute work; the instance and
/// adapter are kept alive alongside the device/queue pair created from them.
struct GPU {
// The wgpu entry point the adapter was enumerated from
pub instance: wgpu::Instance,
// The physical (or software-fallback) adapter in use
pub adapter: wgpu::Adapter,
// Logical device used to create buffers, shaders, and pipelines
pub device: wgpu::Device,
// Queue that finished command buffers are submitted to
pub queue: wgpu::Queue,
}
impl GPU {
/// Acquires a wgpu adapter — preferring the first WebGPU-compliant one the
/// instance enumerates — then requests a device/queue pair with every
/// feature and limit that adapter advertises.
///
/// Returns an error if the device request fails or the current working
/// directory (used as the API trace output path for the `trace` feature)
/// cannot be determined.
pub fn new() -> Result<GPU, Box<dyn std::error::Error>> {
let instance = wgpu::Instance::default();
let mut adapters = instance.enumerate_adapters(wgpu::Backends::all());
// We're going to grab the first WebGPU-compliant adapter
// (reversed so `pop()` walks the list in enumeration order)
adapters.reverse();
// TODO: This crashes if there are literally zero options
let mut adapter = adapters.pop().unwrap();
// NOTE(review): if the *last* candidate is not WebGPU-compliant, the loop
// exits via its condition and that adapter is used anyway — the fallback
// below never runs in that case.
while adapters.len() > 0 {
if adapter.get_downlevel_capabilities().is_webgpu_compliant() {
break;
}
adapter = match adapters.pop() {
Some(a) => a,
// NOTE(review): this arm is unreachable in practice — the loop condition
// guarantees `adapters` is non-empty when `pop()` runs.
None => {
// None of the adapters have the required WebGPU capabilities
// So let's try to request a fallback (software-supported) one
// to at least not crash
let fallback_options = wgpu::RequestAdapterOptions {
power_preference: wgpu::PowerPreference::None,
force_fallback_adapter: true,
compatible_surface: None,
};
let adapter_future = instance.request_adapter(&fallback_options);
match futures::executor::block_on(adapter_future) {
Some(a) => a,
None => panic!("Unable to acquire an adapter"),
}
}
};
}
// Temporarily override for software no matter what
// Let's ask the adapter for everything it can do
let features = adapter.features();
let limits = adapter.limits();
let device_future = adapter.request_device(
&wgpu::DeviceDescriptor {
label: None,
required_features: features,
required_limits: limits,
},
// Second argument is the API trace output directory (enabled by the
// crate's "trace" feature); traces land in the current working directory.
Some(&std::env::current_dir()?),
);
// Block the calling thread until the device is ready
let (device, queue) = futures::executor::block_on(device_future)?;
Ok(GPU {
instance,
adapter,
device,
queue,
})
}
}
/// `GPU_new` constructs a `GPU`, aborting the process with the underlying
/// error message if no usable adapter/device could be acquired.
///
/// Fix: the original marked the `Err` arm `unreachable!()`, but `GPU::new`
/// genuinely can fail (e.g. `request_device` or `current_dir` erroring), so
/// report the error instead of claiming it cannot happen.
#[inline(always)]
fn GPU_new() -> GPU {
    // TODO: Make this safer (return Result instead of panicking)
    match GPU::new() {
        Ok(g) => g,
        Err(e) => panic!("Failed to initialize the GPU: {}", e),
    }
}
/// `create_buffer_init` creates a GPU buffer pre-filled with the contents of
/// `vals`, reinterpreting the i32 slice as raw bytes for the upload.
fn create_buffer_init(
g: &mut GPU,
usage: &mut wgpu::BufferUsages,
vals: &mut Vec<i32>,
) -> wgpu::Buffer {
let val_slice = &vals[..];
let val_ptr = val_slice.as_ptr();
// Each i32 is 4 bytes wide
let val_u8_len = vals.len() * 4;
// SAFETY: the pointer and byte length describe exactly the allocation
// behind `val_slice`; every i32 bit pattern is a valid run of u8s, and
// `val_u8` does not outlive `vals`, which stays borrowed for the whole call.
let val_u8: &[u8] = unsafe { std::slice::from_raw_parts(val_ptr as *const u8, val_u8_len) };
wgpu::util::DeviceExt::create_buffer_init(
&g.device,
&wgpu::util::BufferInitDescriptor {
label: None, // TODO: Add a label for easier debugging?
contents: val_u8,
usage: *usage,
},
)
}
/// `create_empty_buffer` allocates an uninitialized GPU buffer of `size`
/// bytes with the given usage flags; the buffer is not mapped at creation.
fn create_empty_buffer(
    g: &mut GPU,
    usage: &mut wgpu::BufferUsages,
    size: &mut i64,
) -> wgpu::Buffer {
    let descriptor = wgpu::BufferDescriptor {
        label: None, // TODO: Add a label for easier debugging?
        size: *size as u64,
        usage: *usage,
        mapped_at_creation: false, // TODO: With `create_buffer_init` does this make any sense?
    };
    g.device.create_buffer(&descriptor)
}
// TODO: Either add the ability to bind to const values, or come up with a better solution. For
// now, just hardwire a few buffer usage types in these functions

/// Usage flags for a CPU-readback staging buffer: mappable for reading and
/// a valid copy destination.
#[inline(always)]
fn map_read_buffer_type() -> wgpu::BufferUsages {
    wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ
}
/// Usage flags for a shader-visible storage buffer that can also be copied
/// to and from.
#[inline(always)]
fn storage_buffer_type() -> wgpu::BufferUsages {
    wgpu::BufferUsages::COPY_SRC
        | wgpu::BufferUsages::COPY_DST
        | wgpu::BufferUsages::STORAGE
}
/// Nested buffer references: outer index = bind group, inner index = binding slot.
type Vec_Vec_Buffer<'a> = Vec<Vec<&'a wgpu::Buffer>>;

/// `Vec_Vec_Buffer_new` creates an empty nested buffer-reference list.
#[inline(always)]
fn Vec_Vec_Buffer_new<'a>() -> Vec_Vec_Buffer<'a> {
    vec![]
}
/// A single compute-shader execution plan: WGSL source, entrypoint name,
/// the buffers to bind (outer Vec = bind group index, inner Vec = binding
/// index within the group), and the workgroup counts to dispatch.
struct GPGPU<'a> {
pub source: String,
pub entrypoint: String,
pub buffers: Vec_Vec_Buffer<'a>,
pub workgroup_sizes: [i64; 3],
}
impl GPGPU<'_> {
    /// Builds a compute plan from WGSL `source`, pre-grouped buffer
    /// bindings, and workgroup dispatch counts. The entrypoint is fixed to
    /// the conventional `"main"`.
    fn new<'a>(
        source: String,
        buffers: Vec_Vec_Buffer<'a>,
        workgroup_sizes: [i64; 3],
    ) -> GPGPU<'a> {
        let entrypoint = String::from("main");
        GPGPU {
            source,
            entrypoint,
            buffers,
            workgroup_sizes,
        }
    }
}
/// `GPGPU_new` builds a plan from already-grouped buffers with a fixed
/// 1x1x1 dispatch.
#[inline(always)]
fn GPGPU_new<'a>(source: &mut String, buffers: &'a mut Vec_Vec_Buffer) -> GPGPU<'a> {
    let dispatch = [1, 1, 1]; // TODO: Expose this
    GPGPU::new(source.clone(), buffers.clone(), dispatch)
}
/// `GPGPU_new_easy` wraps a single buffer into a one-bind-group plan and
/// derives the workgroup dispatch counts from the buffer's element count.
///
/// Each dispatch dimension is capped at 65535 (a 16-bit limit per axis), so
/// an element's linear index is reconstructed in the shader as
/// `id.x + 65535*id.y + 65535^2*id.z`, for a maximum of 65535^3
/// (~2.8x10^14) addressable elements. The element count is the buffer size
/// divided by 4 because only 32-bit values are supported for now.
///
/// The dispatch grid is a box, so `x*y*z` may exceed the element count
/// (e.g. 65536 elements dispatch a 65535x2x1 grid); the shader author is
/// expected to bounds-check and early-exit on out-of-range indices.
///
/// Fix: the original computed every dimension as `division + 1`, which
/// always over-dispatched — a 4-element buffer produced a 5x1x1 grid,
/// sending invocation `id.x == 4` out of bounds. Ceiling division covers
/// exactly enough invocations instead.
fn GPGPU_new_easy<'a>(source: &mut String, buffer: &'a mut wgpu::Buffer) -> GPGPU<'a> {
    // Number of 32-bit elements in the buffer
    let l: i64 = (buffer.size() / 4).try_into().unwrap();
    // x covers up to 65535 elements directly; y and z are ceiling divisions
    // by 65535 and 65535^2 = 4294836225 respectively. All are clamped to at
    // least 1 so an empty buffer still yields a valid (1, 1, 1) dispatch,
    // matching the original's behavior for l == 0.
    let x = l.clamp(1, 65535);
    let y = ((l + 65534) / 65535).clamp(1, 65535);
    let z = ((l + 4294836224) / 4294836225).max(1);
    GPGPU::new(source.clone(), vec![vec![buffer]], [x, y, z])
}
/// `gpu_run` compiles the plan's WGSL source, binds its buffers, and
/// dispatches the compute shader once. Blocks only on queue submission, not
/// on completion of the GPU work.
fn gpu_run(g: &mut GPU, gg: &mut GPGPU) {
let module = g.device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: None,
source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(&gg.source)),
});
// `layout: None` lets wgpu derive the pipeline layout from the shader
let compute_pipeline = g
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: None,
layout: None,
module: &module,
entry_point: &gg.entrypoint,
compilation_options: wgpu::PipelineCompilationOptions::default(),
});
let mut bind_groups = Vec::new();
let mut encoder = g
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
// Scope the compute pass so `cpass` drops (ending the pass) before
// `encoder.finish()` below
{
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: None,
timestamp_writes: None,
});
cpass.set_pipeline(&compute_pipeline);
// Build one bind group per outer Vec in `gg.buffers`, with one entry per
// inner buffer, using the layouts the derived pipeline reports
for i in 0..gg.buffers.len() {
let bind_group_layout = compute_pipeline.get_bind_group_layout(i.try_into().unwrap());
let bind_group_buffers = &gg.buffers[i];
let mut bind_group_entries = Vec::new();
for j in 0..bind_group_buffers.len() {
bind_group_entries.push(wgpu::BindGroupEntry {
binding: j.try_into().unwrap(),
resource: bind_group_buffers[j].as_entire_binding(),
});
}
let bind_group = g.device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &bind_group_entries[..],
});
bind_groups.push(bind_group);
}
// Binding happens in a second loop so `bind_groups` is fully populated
// before any reference into it is handed to the pass
for i in 0..gg.buffers.len() {
// The Rust borrow checker is forcing my hand here
cpass.set_bind_group(i.try_into().unwrap(), &bind_groups[i], &[]);
}
cpass.dispatch_workgroups(
gg.workgroup_sizes[0].try_into().unwrap(),
gg.workgroup_sizes[1].try_into().unwrap(),
gg.workgroup_sizes[2].try_into().unwrap(),
);
}
g.queue.submit(Some(encoder.finish()));
}
/// `read_buffer` copies `b` into a fresh MAP_READ staging buffer, blocks
/// until the copy and mapping complete, and returns the contents as i32s.
/// Panics if the mapping fails.
fn read_buffer(g: &mut GPU, b: &mut wgpu::Buffer) -> Vec<i32> {
// TODO: Support other value types
let temp_buffer = create_empty_buffer(
g,
&mut map_read_buffer_type(),
&mut b.size().try_into().unwrap(),
);
let mut encoder = g
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
encoder.copy_buffer_to_buffer(b, 0, &temp_buffer, 0, b.size());
g.queue.submit(Some(encoder.finish()));
let temp_slice = temp_buffer.slice(..);
// A bounded flume channel hands the async map result back to this thread
let (sender, receiver) = flume::bounded(1);
temp_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
// Block until the device finishes all submitted work, which also runs the
// map_async callback above
g.device.poll(wgpu::Maintain::wait()).panic_on_timeout();
if let Ok(Ok(())) = receiver.recv() {
let data = temp_slice.get_mapped_range();
let data_ptr = data.as_ptr();
let data_len = data.len() / 4; // From u8 to i32
// SAFETY: the mapped range is valid for `data_len * 4` bytes while `data`
// is alive; assumes the mapped pointer is 4-byte aligned for i32 reads —
// TODO(review): confirm wgpu's mapped-range alignment guarantee.
let data_i32: &[i32] =
unsafe { std::slice::from_raw_parts(data_ptr as *const i32, data_len) };
let result = data_i32.to_vec();
// The mapped view must be dropped before the buffer can be unmapped
drop(data);
temp_buffer.unmap();
// NOTE(review): `temp_buffer` is dropped (freed) when this function
// returns — presumably the buffer-freeing behavior this gist reproduces.
result
} else {
panic!("failed to run compute on gpu!")
}
}
/// Reproduction entry point: fill a 4-element storage buffer with 2s, run a
/// shader that multiplies each element by its global invocation index, then
/// read the buffer back and print it.
fn main() {
    let mut g = GPU_new();
    let mut b = create_buffer_init(
        &mut g,
        &mut storage_buffer_type(),
        &mut filled(&mut i64toi32(&mut 2), &mut 4),
    );
    let mut shader_source = "
@group(0)
@binding(0)
var<storage, read_write> vals: array<i32>;
@compute
@workgroup_size(1)
fn main(@builtin(global_invocation_id) id: vec3<u32>) {
vals[id.x] = vals[id.x] * bitcast<i32>(id.x);
}
"
    .to_string();
    let mut plan = GPGPU_new_easy(&mut shader_source, &mut b);
    gpu_run(&mut g, &mut plan);
    let result = read_buffer(&mut g, &mut b);
    print_vec(&result);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment