Skip to content

Instantly share code, notes, and snippets.

@itzmeanjan
Created August 29, 2021 07:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itzmeanjan/3f6c17217a0dec4a6a981ea7ecf6ab28 to your computer and use it in GitHub Desktop.
Save itzmeanjan/3f6c17217a0dec4a6a981ea7ecf6ab28 to your computer and use it in GitHub Desktop.
😎 Computing Matrix Transpose in Parallel on GPGPU, using Vulkan Compute API πŸ”₯
#version 450
#pragma shader_stage(compute)

// One-dimensional work groups of 32 invocations; one invocation owns one
// row of the lower triangle.
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

// The 1024x1024 i32 matrix lives in a single storage buffer and is
// transposed in place.
layout(set = 0, binding = 0) buffer matrix_block {
int[1024][1024] matrix;
};

void main() {
const uint idx = gl_GlobalInvocationID.x;
// Row 0 has nothing below the diagonal to swap. Any invocation at or
// beyond row 1024 would index out of bounds (valid rows are 0..1023),
// so the guard must be `>= 1024` — the original `> 1024` let a
// hypothetical idx == 1024 through (latent only because the host
// dispatches exactly 1024 invocations).
if(idx == 0 || idx >= 1024) {
return;
}
// Swap every strictly-lower-triangle element (idx, j), j < idx, with its
// mirror (j, idx). Each unordered pair {i, j} is touched by exactly one
// invocation (the one with the larger index), so the in-place swaps are
// race-free and together the invocations transpose the matrix.
for(uint j = 0; j < idx; j++) {
const int tmp = matrix[idx][j];
matrix[idx][j] = matrix[j][idx];
matrix[j][idx] = tmp;
}
}
extern crate rand;
extern crate vulkano;
extern crate vulkano_shaders;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::sync::Arc;
use std::time::Instant;
use vulkano::buffer::{BufferUsage, CpuAccessibleBuffer};
use vulkano::command_buffer::{AutoCommandBufferBuilder, CommandBufferUsage, PrimaryCommandBuffer};
use vulkano::descriptor::descriptor_set::PersistentDescriptorSet;
use vulkano::device::{Device, DeviceExtensions, Features};
use vulkano::instance::PhysicalDevice;
use vulkano::instance::{Instance, InstanceExtensions};
use vulkano::pipeline::{ComputePipeline, ComputePipelineAbstract};
use vulkano::sync::GpuFuture;
use vulkano::Version;
// Matrix dimension: the shader buffer layout, the dispatch size, and the
// CPU-side loops all assume an N x N (1024 x 1024) matrix of i32.
const N: u32 = 1024;
/// Entry point: transposes a reproducible 1024x1024 `i32` matrix once on
/// the GPU (via the compute shader compiled in `mod cs`) and once on the
/// CPU, times both, then asserts element-wise that the results agree.
fn main() {
// Create a Vulkan 1.2 instance with no extra extensions or layers.
let instance = Instance::new(None, Version::V1_2, &InstanceExtensions::none(), None)
.expect("failed to create instance !");
// Pick the first physical device (GPU) the driver reports.
let physical_device = PhysicalDevice::enumerate(&instance)
.next()
.expect("failed to enumerate physical devices");
println!(
"Device: {}\nVulkan API: {}",
physical_device.properties().device_name.as_ref().unwrap(),
physical_device.api_version()
);
// Print the capabilities of every queue family, for visibility only.
for i in physical_device.queue_families() {
println!(
"Queue Count: {}\tCompute: {}\tGraphics: {}",
i.queues_count(),
i.supports_compute(),
i.supports_graphics()
);
}
// Any family that supports compute will do for this compute-only workload.
let queue_family = physical_device
.queue_families()
.find(|&v| v.supports_compute())
.expect("failed to find compute supported queue family");
// The shader's storage buffer (`buffer matrix_block`) needs this extension.
let mut ext = DeviceExtensions::none();
ext.khr_storage_buffer_storage_class = true;
// Create the logical device, requesting one queue (priority 0.5) from the
// chosen family.
let (logical_device, mut queues) = Device::new(
physical_device,
&Features::none(),
&ext,
[(queue_family, 0.5)].iter().cloned(),
)
.expect("failed to create logical logical_device");
// Exactly one queue was requested above, so take the first.
let queue = queues.next().expect("failed to find associated queue");
// to be used for preparing buffer which can be
// mapped to CPU accessible memory, used for performing
// transposition
let matrix_gpu = generate_matrix();
// to be used for computing transposition on CPU;
// also used for asserting result with GPU-computed transpose
let matrix_cpu = generate_matrix();
// Upload the rows into a CPU-accessible buffer; the shader transposes it
// in place, so one buffer serves as both input and output.
let matrix_buf = CpuAccessibleBuffer::from_iter(
logical_device.clone(),
BufferUsage::all(),
false,
matrix_gpu,
)
.expect("failed to create buffer");
// loading compute shader, including shader compilation
// abstracted with macro!
let shader = cs::Shader::load(logical_device.clone()).unwrap();
// preparing compute pipeline
let compute_pipeline = Arc::new(
ComputePipeline::new(
logical_device.clone(),
&shader.main_entry_point(),
&(),
None,
)
.unwrap(),
);
// adding descriptors as per layout, into compute pipeline
let layout = compute_pipeline.layout().descriptor_set_layout(0).unwrap();
let set = Arc::new(
PersistentDescriptorSet::start(layout.clone())
// only one buffer needed, because transposition is in-place
.add_buffer(matrix_buf.clone())
.unwrap()
.build()
.unwrap(),
);
// create command buffer & start recording commands in it
let mut builder = AutoCommandBufferBuilder::primary(
logical_device.clone(),
queue.family(),
CommandBufferUsage::OneTimeSubmit,
)
.unwrap();
// only single command recorded in command buffer
builder
.dispatch(
// launching 32 work groups in x-axis
// (N / 32 == 32 groups x local_size_x == 32 -> N invocations, one per row)
[N / 32, 1, 1],
compute_pipeline.clone(),
set.clone(),
(),
std::iter::empty(),
)
.unwrap();
// ending command recording
let command_buffer = builder.build().unwrap();
// -- computing matrix transpose in GPU --
let start = Instant::now();
let finished = command_buffer.execute(queue.clone()).unwrap();
// Block until the GPU signals the fence, i.e. the dispatch has completed,
// so the elapsed time below covers the full GPU computation.
finished
.then_signal_fence_and_flush()
.unwrap()
.wait(None)
.unwrap();
println!("GPU matrix transpose: {:?}", start.elapsed());
// -- done with transposing matrix in GPU --
// -- computing matrix transpose in CPU --
let n = N as usize;
let start = Instant::now();
let mut matrix: Vec<_> = matrix_cpu.collect();
// Classic in-place transpose: swap the strictly-lower triangle with the
// strictly-upper triangle; the diagonal stays put.
for i in 1..n {
for j in 0..i {
let tmp: i32 = matrix[i][j];
matrix[i][j] = matrix[j][i];
matrix[j][i] = tmp;
}
}
println!("CPU matrix transpose: {:?}", start.elapsed());
// -- done with transposing matrix in CPU --
// -- asserting CPU vs. GPU matrix transposition --
let start = Instant::now();
// Map the buffer back for host reading and compare every element.
let r_matrix_buf = matrix_buf.read().unwrap();
for i in 0..n {
for j in 0..n {
assert_eq!(r_matrix_buf[i][j], matrix[i][j]);
}
}
println!("Transpose asserted: {:?}", start.elapsed());
}
/// Produces the N (1024) rows of a pseudo-random `i32` matrix as a boxed
/// exact-size iterator, suitable both for uploading into a GPU buffer and
/// for collecting into a `Vec` on the CPU side.
///
/// The RNG is seeded with a fixed value so every call yields the same
/// matrix (reproducible runs, and the GPU/CPU copies stay in sync).
fn generate_matrix() -> Box<dyn std::iter::ExactSizeIterator<Item = [i32; 1024]>> {
    // Seed ONCE, outside the row closure. The original re-seeded inside
    // the closure, re-creating the RNG for every row — which made all
    // 1024 rows identical instead of a genuinely varied (but still
    // reproducible) matrix.
    let mut rng = StdRng::seed_from_u64(17);
    Box::new((0..N).map(move |_| {
        let mut arr = [0i32; 1024];
        for slot in arr.iter_mut() {
            *slot = rng.gen::<i32>();
        }
        arr
    }))
}
// Compile-time shader module: `vulkano_shaders::shader!` compiles the GLSL
// compute shader at the given path to SPIR-V at build time and generates
// the `Shader` type that `main` loads via `cs::Shader::load`.
mod cs {
// does shader compilation
vulkano_shaders::shader! {
ty: "compute",
path: "./matrix_transpose.glsl",
vulkan_version: "1.2",
}
}
@itzmeanjan
Copy link
Author

Background

This code snippet accompanies a post I wrote on using the Vulkan Compute API for transposing large square matrices.

Usage

  • Download this GIST
  • Create project directory tree by running
cargo init
  • Add following as dependencies in Cargo.toml
vulkano = "0.24.0"
vulkano-shaders = "0.24.0"
rand = "0.8.4"
  • In generated project directory's src/main.rs, paste this
  • Copy the compute shader code into a file named matrix_transpose.glsl in the root of the cargo project (the path referenced by the shader! macro).
  • Build & run project
cargo run --release

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment