Created
August 29, 2021 07:06
-
-
Save itzmeanjan/3f6c17217a0dec4a6a981ea7ecf6ab28 to your computer and use it in GitHub Desktop.
Computing Matrix Transpose in Parallel on GPGPU, using the Vulkan Compute API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#version 450
#pragma shader_stage(compute)

// In-place transpose of a 1024 x 1024 int matrix.
// Each invocation owns one row index `idx` and swaps the strictly
// lower-triangular entries matrix[idx][j] (j < idx) with their mirrored
// counterparts matrix[j][idx]; pairing j < idx guarantees each element
// pair is swapped by exactly one invocation, so no synchronization is
// required between invocations.
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

layout(set = 0, binding = 0) buffer matrix_block {
    int[1024][1024] matrix;
};

void main() {
    const uint idx = gl_GlobalInvocationID.x;
    // Bounds guard. The original condition `idx > 1024` let idx == 1024
    // through, which would read/write one row past the end of the
    // 1024 x 1024 buffer; `>=` is the correct bound. (idx == 0 performs
    // zero swaps in the loop below, so no explicit skip is needed.)
    if (idx >= 1024) {
        return;
    }
    for (uint j = 0; j < idx; j++) {
        const int tmp = matrix[idx][j];
        matrix[idx][j] = matrix[j][idx];
        matrix[j][idx] = tmp;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate rand; | |
extern crate vulkano; | |
extern crate vulkano_shaders; | |
use rand::rngs::StdRng; | |
use rand::{Rng, SeedableRng}; | |
use std::sync::Arc; | |
use std::time::Instant; | |
use vulkano::buffer::{BufferUsage, CpuAccessibleBuffer}; | |
use vulkano::command_buffer::{AutoCommandBufferBuilder, CommandBufferUsage, PrimaryCommandBuffer}; | |
use vulkano::descriptor::descriptor_set::PersistentDescriptorSet; | |
use vulkano::device::{Device, DeviceExtensions, Features}; | |
use vulkano::instance::PhysicalDevice; | |
use vulkano::instance::{Instance, InstanceExtensions}; | |
use vulkano::pipeline::{ComputePipeline, ComputePipelineAbstract}; | |
use vulkano::sync::GpuFuture; | |
use vulkano::Version; | |
// Matrix dimension: the GLSL shader, the buffers, and the dispatch size all
// assume a fixed N x N (1024 x 1024) square matrix of i32.
const N: u32 = 1024;
fn main() {
    // Create a Vulkan instance targeting API version 1.2; no window-system
    // extensions or validation layers are needed for headless compute.
    let instance = Instance::new(None, Version::V1_2, &InstanceExtensions::none(), None)
        .expect("failed to create instance !");
    // Pick the first enumerated physical device (GPU). NOTE(review): no
    // filtering is done here — on multi-GPU systems this may not be the
    // device you expect.
    let physical_device = PhysicalDevice::enumerate(&instance)
        .next()
        .expect("failed to enumerate physical devices");
    println!(
        "Device: {}\nVulkan API: {}",
        physical_device.properties().device_name.as_ref().unwrap(),
        physical_device.api_version()
    );
    // Informational dump of every queue family and its capabilities.
    for i in physical_device.queue_families() {
        println!(
            "Queue Count: {}\tCompute: {}\tGraphics: {}",
            i.queues_count(),
            i.supports_compute(),
            i.supports_graphics()
        );
    }
    // Select any queue family that supports compute dispatch.
    let queue_family = physical_device
        .queue_families()
        .find(|&v| v.supports_compute())
        .expect("failed to find compute supported queue family");
    // The storage-buffer storage class extension is required by the shader's
    // `buffer` block declaration.
    let mut ext = DeviceExtensions::none();
    ext.khr_storage_buffer_storage_class = true;
    // Create the logical device with a single queue (priority 0.5) from the
    // chosen family; no optional device features are requested.
    let (logical_device, mut queues) = Device::new(
        physical_device,
        &Features::none(),
        &ext,
        [(queue_family, 0.5)].iter().cloned(),
    )
    .expect("failed to create logical logical_device");
    let queue = queues.next().expect("failed to find associated queue");
    // to be used for preparing buffer which can be
    // mapped to CPU accessible memory, used for performing
    // transposition
    let matrix_gpu = generate_matrix();
    // to be used for computing transposition on CPU;
    // also used for asserting result with GPU-computed transpose
    // (both calls are seeded identically, so the two matrices match)
    let matrix_cpu = generate_matrix();
    // CPU-accessible storage buffer holding the matrix row-by-row; the GPU
    // transposes it in place and we read it back through the same buffer.
    let matrix_buf = CpuAccessibleBuffer::from_iter(
        logical_device.clone(),
        BufferUsage::all(),
        false,
        matrix_gpu,
    )
    .expect("failed to create buffer");
    // loading compute shader, including shader compilation
    // abstracted with macro! (see `mod cs` at the bottom of the file)
    let shader = cs::Shader::load(logical_device.clone()).unwrap();
    // preparing compute pipeline from the shader's `main` entry point
    let compute_pipeline = Arc::new(
        ComputePipeline::new(
            logical_device.clone(),
            &shader.main_entry_point(),
            &(),
            None,
        )
        .unwrap(),
    );
    // adding descriptors as per layout, into compute pipeline
    // (descriptor set 0 matches `layout(set = 0, binding = 0)` in the shader)
    let layout = compute_pipeline.layout().descriptor_set_layout(0).unwrap();
    let set = Arc::new(
        PersistentDescriptorSet::start(layout.clone())
            // only one buffer needed, because transposition is in-place
            .add_buffer(matrix_buf.clone())
            .unwrap()
            .build()
            .unwrap(),
    );
    // create command buffer & start recording commands in it
    let mut builder = AutoCommandBufferBuilder::primary(
        logical_device.clone(),
        queue.family(),
        CommandBufferUsage::OneTimeSubmit,
    )
    .unwrap();
    // only single command recorded in command buffer
    builder
        .dispatch(
            // N / 32 = 32 work groups of local_size_x = 32 invocations
            // each => exactly N (1024) invocations, one per matrix row
            [N / 32, 1, 1],
            compute_pipeline.clone(),
            set.clone(),
            (),
            std::iter::empty(),
        )
        .unwrap();
    // ending command recording
    let command_buffer = builder.build().unwrap();
    // -- computing matrix transpose in GPU --
    // NOTE(review): the timed span includes submission + fence wait, i.e.
    // queue/driver overhead, not just kernel execution time.
    let start = Instant::now();
    let finished = command_buffer.execute(queue.clone()).unwrap();
    finished
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();
    println!("GPU matrix transpose: {:?}", start.elapsed());
    // -- done with transposing matrix in GPU --
    // -- computing matrix transpose in CPU --
    let n = N as usize;
    let start = Instant::now();
    let mut matrix: Vec<_> = matrix_cpu.collect();
    // Classic in-place transpose: swap each strictly-lower-triangular
    // element with its mirror; mirrors the GLSL kernel's loop structure.
    for i in 1..n {
        for j in 0..i {
            let tmp: i32 = matrix[i][j];
            matrix[i][j] = matrix[j][i];
            matrix[j][i] = tmp;
        }
    }
    println!("CPU matrix transpose: {:?}", start.elapsed());
    // -- done with transposing matrix in CPU --
    // -- asserting CPU vs. GPU matrix transposition --
    let start = Instant::now();
    // `read()` maps the buffer for host access; it succeeds because the
    // fence above guarantees the GPU has finished writing.
    let r_matrix_buf = matrix_buf.read().unwrap();
    for i in 0..n {
        for j in 0..n {
            assert_eq!(r_matrix_buf[i][j], matrix[i][j]);
        }
    }
    println!("Transpose asserted: {:?}", start.elapsed());
}
fn generate_matrix() -> Box<dyn std::iter::ExactSizeIterator<Item = [i32; 1024]>> { | |
Box::new((0..N).map(move |_| { | |
// to make it easily reproducible | |
let mut rng = StdRng::seed_from_u64(17); | |
let mut arr = [0i32; 1024]; | |
for i in 0..(N as usize) { | |
arr[i] = rng.gen::<i32>(); | |
} | |
arr | |
})) | |
} | |
// Compile-time shader module: the `vulkano_shaders::shader!` macro compiles
// the GLSL compute shader at build time and generates `cs::Shader` with a
// `load()` constructor plus the reflected descriptor-set layouts used by
// `main` above.
mod cs {
    // does shader compilation
    vulkano_shaders::shader! {
        ty: "compute",
        path: "./matrix_transpose.glsl",
        vulkan_version: "1.2",
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Background
This code snippet accompanies a post I wrote on using the Vulkan Compute API for transposing large square matrices.
Usage
Add the required dependencies (rand, vulkano, vulkano-shaders) to Cargo.toml, then paste this code into src/main.rs and place the shader alongside it as matrix_transpose.glsl.