Created
June 10, 2024 14:53
-
-
Save dfellis/79e3789e3fc3ea80580f334ec652404d to your computer and use it in GitHub Desktop.
wgpu buffer freeing bug reproduction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cargo manifest for the wgpu buffer-freeing bug reproduction.
[package]
name = "alan_generated_bin"
edition = "2021"

[dependencies]
flume = "0.11.0"   # bounded channel used to receive the buffer map_async result
futures = "0.3.30" # block_on executor for wgpu's async adapter/device requests
# "trace" enables wgpu API tracing (the trace path is passed to request_device)
wgpu = { version = "0.20.0", features = ["trace"] }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// `i64toi32` casts an i64 to an i32, truncating with Rust's `as` semantics
/// (values outside the i32 range wrap).
#[inline(always)]
fn i64toi32(i: &i64) -> i32 {
    let wide = *i;
    wide as i32
}
/// `filled` returns a Vec<V> of length `l` where every element is a clone of `i`.
#[inline(always)]
fn filled<V: std::clone::Clone>(i: &V, l: &i64) -> Vec<V> {
    std::iter::repeat(i.clone()).take(*l as usize).collect()
}
/// `print_vec` pretty-prints a vector as `[a, b, c]`, assuming the element
/// type implements `Display`.
#[inline(always)]
fn print_vec<A: std::fmt::Display>(vs: &Vec<A>) {
    let rendered: Vec<String> = vs.iter().map(|v| v.to_string()).collect();
    println!("[{}]", rendered.join(", "));
}
/// Bundles the wgpu handles needed to create buffers and submit GPU work.
struct GPU {
    pub instance: wgpu::Instance, // wgpu entry point the adapter was enumerated from
    pub adapter: wgpu::Adapter,   // the physical/fallback adapter selected in GPU::new
    pub device: wgpu::Device,     // logical device used to create buffers/pipelines
    pub queue: wgpu::Queue,       // command queue for submitting encoded work
}
impl GPU { | |
pub fn new() -> Result<GPU, Box<dyn std::error::Error>> { | |
let instance = wgpu::Instance::default(); | |
let mut adapters = instance.enumerate_adapters(wgpu::Backends::all()); | |
// We're going to grab the first WebGPU-compliant adapter | |
adapters.reverse(); | |
// TODO: This crashes if there are literally zero options | |
let mut adapter = adapters.pop().unwrap(); | |
while adapters.len() > 0 { | |
if adapter.get_downlevel_capabilities().is_webgpu_compliant() { | |
break; | |
} | |
adapter = match adapters.pop() { | |
Some(a) => a, | |
None => { | |
// None of the adapters have the required WebGPU capabilities | |
// So let's try to request a fallback (software-supported) one | |
// to at least not crash | |
let fallback_options = wgpu::RequestAdapterOptions { | |
power_preference: wgpu::PowerPreference::None, | |
force_fallback_adapter: true, | |
compatible_surface: None, | |
}; | |
let adapter_future = instance.request_adapter(&fallback_options); | |
match futures::executor::block_on(adapter_future) { | |
Some(a) => a, | |
None => panic!("Unable to acquire an adapter"), | |
} | |
} | |
}; | |
} | |
// Temporarily override for software no matter what | |
// Let's ask the adapter for everything it can do | |
let features = adapter.features(); | |
let limits = adapter.limits(); | |
let device_future = adapter.request_device( | |
&wgpu::DeviceDescriptor { | |
label: None, | |
required_features: features, | |
required_limits: limits, | |
}, | |
Some(&std::env::current_dir()?), | |
); | |
let (device, queue) = futures::executor::block_on(device_future)?; | |
Ok(GPU { | |
instance, | |
adapter, | |
device, | |
queue, | |
}) | |
} | |
} | |
#[inline(always)] | |
fn GPU_new() -> GPU { | |
// TODO: Make this safer | |
match GPU::new() { | |
Ok(g) => g, | |
Err(_) => unreachable!(), | |
} | |
} | |
fn create_buffer_init( | |
g: &mut GPU, | |
usage: &mut wgpu::BufferUsages, | |
vals: &mut Vec<i32>, | |
) -> wgpu::Buffer { | |
let val_slice = &vals[..]; | |
let val_ptr = val_slice.as_ptr(); | |
let val_u8_len = vals.len() * 4; | |
let val_u8: &[u8] = unsafe { std::slice::from_raw_parts(val_ptr as *const u8, val_u8_len) }; | |
wgpu::util::DeviceExt::create_buffer_init( | |
&g.device, | |
&wgpu::util::BufferInitDescriptor { | |
label: None, // TODO: Add a label for easier debugging? | |
contents: val_u8, | |
usage: *usage, | |
}, | |
) | |
} | |
fn create_empty_buffer( | |
g: &mut GPU, | |
usage: &mut wgpu::BufferUsages, | |
size: &mut i64, | |
) -> wgpu::Buffer { | |
g.device.create_buffer(&wgpu::BufferDescriptor { | |
label: None, // TODO: Add a label for easier debugging? | |
size: *size as u64, | |
usage: *usage, | |
mapped_at_creation: false, // TODO: With `create_buffer_init` does this make any sense? | |
}) | |
} | |
// TODO: Either add the ability to bind to const values, or come up with a better solution. For | |
// now, just hardwire a few buffer usage types in these functions | |
#[inline(always)] | |
fn map_read_buffer_type() -> wgpu::BufferUsages { | |
wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST | |
} | |
#[inline(always)] | |
fn storage_buffer_type() -> wgpu::BufferUsages { | |
wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC | |
} | |
type Vec_Vec_Buffer<'a> = Vec<Vec<&'a wgpu::Buffer>>; | |
#[inline(always)] | |
fn Vec_Vec_Buffer_new<'a>() -> Vec_Vec_Buffer<'a> { | |
Vec::new() | |
} | |
/// A single compute-dispatch plan: shader source plus the buffers to bind and
/// the grid dimensions to launch.
struct GPGPU<'a> {
    pub source: String,              // WGSL shader source text
    pub entrypoint: String,          // shader entry-point name ("main" via GPGPU::new)
    pub buffers: Vec_Vec_Buffer<'a>, // buffers[bind_group][binding]
    pub workgroup_sizes: [i64; 3],   // x/y/z counts passed to dispatch_workgroups
}
impl GPGPU<'_> { | |
fn new<'a>( | |
source: String, | |
buffers: Vec_Vec_Buffer<'a>, | |
workgroup_sizes: [i64; 3], | |
) -> GPGPU<'a> { | |
GPGPU { | |
source, | |
entrypoint: "main".to_string(), | |
buffers, | |
workgroup_sizes, | |
} | |
} | |
} | |
#[inline(always)] | |
fn GPGPU_new<'a>(source: &mut String, buffers: &'a mut Vec_Vec_Buffer) -> GPGPU<'a> { | |
GPGPU::new(source.clone(), buffers.clone(), [1, 1, 1]) // TODO: Expose this | |
} | |
fn GPGPU_new_easy<'a>(source: &mut String, buffer: &'a mut wgpu::Buffer) -> GPGPU<'a> { | |
// In order to support larger arrays, we need to split the buffer length across them. Each of | |
// indices is allowed to be up to 65535 (yes, a 16-bit integer) leading to a maximum length of | |
// 65535^3, or about 2.815x10^14 elements (about 281 trillion elements). Not quite up to the | |
// 64-bit address space limit 2^64 or about 1.845x10^19 or about 18 quintillion elements, but | |
// enough for exactly 1PB of 32-bit numbers in an array, so we should be good. | |
// For now, the 65535 limit should be hardcoded by the shader author and an early exit | |
// conditional check if the shader is operating on a nonexistent array index. This may change | |
// in the future if the performance penalty of the bounds check is considered too high. | |
// | |
// Explaining the equation itself, the array length, L, needs to be split into X, Y, and Z | |
// parts where L = X + A*Y + B*Z, with X, Y, and Z bound between 0 and 65534 (inclusive) while | |
// A is 65535 and B is 65535^2 or 4294836225. Computing each dimension is to take the original | |
// length of the array (which is the buffer size divided by 4 because we're only supporting | |
// 32-bit numbers for now) and then getting the division and remainder first by the B constant, | |
// and the Z limit becomes the division + 1, while the remainder is executed division and | |
// remainder on the A constant, division + 1, and this remainder becomes the X limit (plus 1). | |
// Including this big explanation in case I've made an off-by-one error here ;) | |
let l: i64 = (buffer.size() / 4).try_into().unwrap(); | |
let z_div = l / 4294836225; | |
let z = z_div + 1; | |
let z_rem = l.wrapping_rem(4294836225); | |
let y_div = z_rem / 65535; | |
let y = y_div + 1; | |
let y_rem = z_rem.wrapping_rem(65535); | |
let x = y_rem + 1; | |
GPGPU::new(source.clone(), vec![vec![buffer]], [x, y, z]) | |
} | |
/// `gpu_run` compiles the plan's WGSL source into a compute pipeline, binds the
/// plan's buffers, and submits a single dispatch sized by `workgroup_sizes`.
fn gpu_run(g: &mut GPU, gg: &mut GPGPU) {
    // Compile the WGSL shader source into a module.
    let module = g.device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: None,
        source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(&gg.source)),
    });
    // `layout: None` lets wgpu derive the pipeline layout from the shader,
    // which is what makes `get_bind_group_layout(i)` below meaningful.
    let compute_pipeline = g
        .device
        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: None,
            layout: None,
            module: &module,
            entry_point: &gg.entrypoint,
            compilation_options: wgpu::PipelineCompilationOptions::default(),
        });
    let mut bind_groups = Vec::new();
    let mut encoder = g
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    {
        // Inner scope: `cpass` must be dropped before `encoder.finish()`.
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: None,
            timestamp_writes: None,
        });
        cpass.set_pipeline(&compute_pipeline);
        // Outer index i = bind group, inner index j = binding within the group.
        for i in 0..gg.buffers.len() {
            let bind_group_layout = compute_pipeline.get_bind_group_layout(i.try_into().unwrap());
            let bind_group_buffers = &gg.buffers[i];
            let mut bind_group_entries = Vec::new();
            for j in 0..bind_group_buffers.len() {
                bind_group_entries.push(wgpu::BindGroupEntry {
                    binding: j.try_into().unwrap(),
                    resource: bind_group_buffers[j].as_entire_binding(),
                });
            }
            let bind_group = g.device.create_bind_group(&wgpu::BindGroupDescriptor {
                label: None,
                layout: &bind_group_layout,
                entries: &bind_group_entries[..],
            });
            bind_groups.push(bind_group);
        }
        // Groups are created in one loop and bound in a second so that
        // `bind_groups` is fully populated before `cpass` borrows into it.
        for i in 0..gg.buffers.len() {
            // The Rust borrow checker is forcing my hand here
            cpass.set_bind_group(i.try_into().unwrap(), &bind_groups[i], &[]);
        }
        cpass.dispatch_workgroups(
            gg.workgroup_sizes[0].try_into().unwrap(),
            gg.workgroup_sizes[1].try_into().unwrap(),
            gg.workgroup_sizes[2].try_into().unwrap(),
        );
    }
    g.queue.submit(Some(encoder.finish()));
}
fn read_buffer(g: &mut GPU, b: &mut wgpu::Buffer) -> Vec<i32> { | |
// TODO: Support other value types | |
let temp_buffer = create_empty_buffer( | |
g, | |
&mut map_read_buffer_type(), | |
&mut b.size().try_into().unwrap(), | |
); | |
let mut encoder = g | |
.device | |
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); | |
encoder.copy_buffer_to_buffer(b, 0, &temp_buffer, 0, b.size()); | |
g.queue.submit(Some(encoder.finish())); | |
let temp_slice = temp_buffer.slice(..); | |
let (sender, receiver) = flume::bounded(1); | |
temp_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); | |
g.device.poll(wgpu::Maintain::wait()).panic_on_timeout(); | |
if let Ok(Ok(())) = receiver.recv() { | |
let data = temp_slice.get_mapped_range(); | |
let data_ptr = data.as_ptr(); | |
let data_len = data.len() / 4; // From u8 to i32 | |
let data_i32: &[i32] = | |
unsafe { std::slice::from_raw_parts(data_ptr as *const i32, data_len) }; | |
let result = data_i32.to_vec(); | |
drop(data); | |
temp_buffer.unmap(); | |
result | |
} else { | |
panic!("failed to run compute on gpu!") | |
} | |
} | |
fn main() { | |
let mut g = GPU_new(); | |
let mut b = create_buffer_init( | |
&mut g, | |
&mut storage_buffer_type(), | |
&mut filled(&mut i64toi32(&mut 2), &mut 4), | |
); | |
let mut plan = GPGPU_new_easy( | |
&mut " | |
@group(0) | |
@binding(0) | |
var<storage, read_write> vals: array<i32>; | |
@compute | |
@workgroup_size(1) | |
fn main(@builtin(global_invocation_id) id: vec3<u32>) { | |
vals[id.x] = vals[id.x] * bitcast<i32>(id.x); | |
} | |
" | |
.to_string(), | |
&mut b, | |
); | |
gpu_run(&mut g, &mut plan); | |
print_vec(&mut read_buffer(&mut g, &mut b)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment