😎 Parallel Matrix Multiplication on GPGPU, using Vulkan Compute API 🚴🏼
matrix_multiply.glsl:

#version 450
#pragma shader_stage(compute)

// each 8x4 workgroup computes a 32-cell tile of the result matrix
layout(local_size_x = 8, local_size_y = 4, local_size_z = 1) in;

// flattened 1024x1024 matrices, 1 << 20 elements each
layout(set = 0, binding = 0) readonly buffer MatrixA {
    int matrix_a[1 << 20];
};
layout(set = 0, binding = 1) readonly buffer MatrixB {
    int matrix_b[1 << 20];
};
layout(set = 0, binding = 2) writeonly buffer MatrixC {
    int matrix_c[1 << 20];
};

void main() {
    // one invocation per cell of the result matrix
    const uint row = gl_GlobalInvocationID.x;
    const uint col = gl_GlobalInvocationID.y;
    // skip invocations dispatched beyond the matrix bounds
    if(row >= 1024 || col >= 1024) {
        return;
    }
    // dot product of row `row` of A with column `col` of B
    int sum = 0;
    for(uint i = 0; i < 1024; i++) {
        sum += matrix_a[row * 1024 + i] * matrix_b[i * 1024 + col];
    }
    matrix_c[row * 1024 + col] = sum;
}
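Since the local size is 8x4, each workgroup covers 32 output cells, and the host program below dispatches [1024 / 8, 1024 / 4, 1] workgroups to cover the full 1024x1024 result; the bounds check in the shader only matters when the matrix size is not a multiple of the workgroup dimensions. A hypothetical helper (not part of the gist) that computes the counts with ceiling division for an arbitrary size n:

// hypothetical helper: number of workgroups along each axis for an n x n
// result with the shader's 8 x 4 local size; ceiling division ensures
// full coverage when n isn't a multiple of the workgroup dimensions
fn dispatch_dims(n: u32) -> [u32; 3] {
    const LOCAL_X: u32 = 8;
    const LOCAL_Y: u32 = 4;
    [(n + LOCAL_X - 1) / LOCAL_X, (n + LOCAL_Y - 1) / LOCAL_Y, 1]
}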
main.rs:

extern crate rand;
extern crate vulkano;
extern crate vulkano_shaders;

use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::sync::Arc;
use std::time::Instant;
use vulkano::buffer::{BufferUsage, CpuAccessibleBuffer, ImmutableBuffer};
use vulkano::command_buffer::{AutoCommandBufferBuilder, CommandBufferUsage, PrimaryCommandBuffer};
use vulkano::descriptor::descriptor_set::PersistentDescriptorSet;
use vulkano::device::{Device, DeviceExtensions, Features};
use vulkano::instance::PhysicalDevice;
use vulkano::instance::{Instance, InstanceExtensions};
use vulkano::pipeline::{ComputePipeline, ComputePipelineAbstract};
use vulkano::sync::GpuFuture;
use vulkano::Version;

// total number of elements in each 1024 x 1024 matrix
const N: u32 = 1 << 20;
fn main() {
    let instance = Instance::new(None, Version::V1_2, &InstanceExtensions::none(), None)
        .expect("failed to create instance");
    let physical_device = PhysicalDevice::enumerate(&instance)
        .next()
        .expect("failed to enumerate physical devices");
    println!(
        "Device: {}\nVulkan API: {}",
        physical_device.properties().device_name.as_ref().unwrap(),
        physical_device.api_version()
    );
    for i in physical_device.queue_families() {
        println!(
            "Queue Count: {}\tCompute: {}\tGraphics: {}",
            i.queues_count(),
            i.supports_compute(),
            i.supports_graphics()
        );
    }
    let queue_family = physical_device
        .queue_families()
        .find(|&v| v.supports_compute())
        .expect("failed to find compute supported queue family");
    let mut ext = DeviceExtensions::none();
    ext.khr_storage_buffer_storage_class = true;
    let (logical_device, mut queues) = Device::new(
        physical_device,
        &Features::none(),
        &ext,
        [(queue_family, 0.5)].iter().cloned(),
    )
    .expect("failed to create logical device");
    let queue = queues.next().expect("failed to find associated queue");
    let matrix_a = generate_square_matrix(Some(13));
    let matrix_b = generate_square_matrix(Some(17));
    let matrix_c = generate_square_matrix(None);

    // Matrix A --- stored in device-local memory; the CPU can't access it
    let (matrix_a_buf, _) = ImmutableBuffer::from_iter(matrix_a, BufferUsage::all(), queue.clone())
        .expect("failed to create immutable buffer");
    // Matrix B --- stored in device-local memory; the CPU can't access it
    let (matrix_b_buf, _) = ImmutableBuffer::from_iter(matrix_b, BufferUsage::all(), queue.clone())
        .expect("failed to create immutable buffer");
    // Matrix C --- the resulting matrix can be accessed by both CPU and GPU
    let matrix_c_buf =
        CpuAccessibleBuffer::from_iter(logical_device.clone(), BufferUsage::all(), false, matrix_c)
            .expect("failed to create storage buffer");

    // loading the compute shader; compilation is abstracted by the
    // vulkano_shaders::shader! macro in the `cs` module below
    let shader = cs::Shader::load(logical_device.clone()).unwrap();
    // preparing the compute pipeline
    let compute_pipeline = Arc::new(
        ComputePipeline::new(
            logical_device.clone(),
            &shader.main_entry_point(),
            &(),
            None,
        )
        .unwrap(),
    );
    // binding the three buffers to the descriptor set layout (set = 0, bindings 0..=2)
    let layout = compute_pipeline.layout().descriptor_set_layout(0).unwrap();
    let set = Arc::new(
        PersistentDescriptorSet::start(layout.clone())
            .add_buffer(matrix_a_buf.clone())
            .unwrap()
            .add_buffer(matrix_b_buf.clone())
            .unwrap()
            .add_buffer(matrix_c_buf.clone())
            .unwrap()
            .build()
            .unwrap(),
    );
    // create a command buffer & start recording commands into it
    let mut builder = AutoCommandBufferBuilder::primary(
        logical_device.clone(),
        queue.family(),
        CommandBufferUsage::OneTimeSubmit,
    )
    .unwrap();
    // only a single command is recorded: dispatching 128 x 256 x 1
    // workgroups of 8 x 4 invocations to cover all 1024 x 1024 cells
    builder
        .dispatch(
            [1024 / 8, 1024 / 4, 1],
            compute_pipeline.clone(),
            set.clone(),
            (),
            std::iter::empty(),
        )
        .unwrap();
    // ending command recording
    let command_buffer = builder.build().unwrap();

    // computing matrix multiplication on the GPU
    let start = Instant::now();
    let finished = command_buffer.execute(queue.clone()).unwrap();
    finished
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();
    let gpu_tm = start.elapsed();
    println!("GPU matrix multiply: {:?}", gpu_tm);
    // regenerate the same input matrices on the CPU side (the iterators
    // above were consumed while filling the GPU buffers)
    let r_matrix_a = generate_square_matrix(Some(13)).collect::<Vec<i32>>();
    let r_matrix_b = generate_square_matrix(Some(17)).collect::<Vec<i32>>();
    // reading the GPU-computed matrix multiplication result
    let gpu_result = matrix_c_buf.read().unwrap();

    // computing matrix multiplication on the CPU, asserting each cell
    // against the GPU result
    let start = Instant::now();
    for i in 0..1024 {
        for j in 0..1024 {
            let mut sum = 0i32;
            for k in 0..1024 {
                // wrapping arithmetic matches the GPU's i32 overflow behaviour
                // and avoids overflow panics in debug builds
                sum = sum.wrapping_add(
                    r_matrix_a[i * 1024 + k].wrapping_mul(r_matrix_b[k * 1024 + j]),
                );
            }
            assert_eq!(sum, gpu_result[i * 1024 + j]);
        }
    }
    println!(
        "CPU matrix multiply: {:?}\nSpeed Up: {:.2}x",
        start.elapsed(),
        start.elapsed().as_secs_f64() / gpu_tm.as_secs_f64()
    );
}
// reproducible random square matrix generator, exposed as a
// single-dimensional iterator over row-major cells
fn generate_square_matrix(seed: Option<u64>) -> Box<dyn std::iter::ExactSizeIterator<Item = i32>> {
    match seed {
        Some(seed) => {
            let mut rng = StdRng::seed_from_u64(seed);
            Box::new((0..N).map(move |_| rng.gen::<i32>()))
        }
        // `None` yields an all-zero matrix, used to size the result buffer
        None => Box::new((0..N).map(|_| 0)),
    }
}

mod cs {
    // compiles ./matrix_multiply.glsl to SPIR-V at build time
    vulkano_shaders::shader! {
        ty: "compute",
        path: "./matrix_multiply.glsl",
        vulkan_version: "1.2",
    }
}
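To build this as-is, the project would need a Cargo.toml along these lines. The crate versions and package name below are assumptions, inferred from the 0.2x-era vulkano API used above (Instance::new taking a Version, ComputePipelineAbstract); the gist itself does not state them:

[package]
name = "gpgpu-matrix-multiply"   # hypothetical crate name
version = "0.1.0"
edition = "2018"

[dependencies]
rand = "0.8"
vulkano = "0.23"          # assumed: a vulkano 0.2x release matching the API above
vulkano-shaders = "0.23"  # assumed: kept in lockstep with vulkano

With that in place, cargo run --release should print the device info and both timings; a debug build would make the CPU verification loop dramatically slower.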
@itzmeanjan (Author): Thanks @seddonm1

@mjaric commented Jan 14, 2023:
The comparison is not fair: the CPU is doing the calculation in a single thread. Taking the dot product of those two arrays (which is more than the shader does) using ndarray takes 275 microseconds on a 16 vCore CPU.

Using device: Radeon RX 590 Series (type: DiscreteGpu). API version: 1.3.209
GPU matrix multiply: 28.4929ms
CPU matrix multiply: 548.6µs <-- dot product using ndarray
Speed Up: 0x

Anyhow, the example is nice :) TNX!
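To address the single-thread point, parallelizing the gist's CPU check is a small change. A minimal sketch, assuming the rayon crate (a hypothetical addition; the gist itself is single-threaded):

use rayon::prelude::*;

// multiply two n x n row-major matrices in parallel, one rayon task per
// output row; wrapping arithmetic mirrors the GPU's i32 overflow behaviour
fn matmul_par(a: &[i32], b: &[i32], n: usize) -> Vec<i32> {
    let mut c = vec![0i32; n * n];
    c.par_chunks_mut(n).enumerate().for_each(|(i, row_out)| {
        for j in 0..n {
            let mut sum = 0i32;
            for k in 0..n {
                sum = sum.wrapping_add(a[i * n + k].wrapping_mul(b[k * n + j]));
            }
            row_out[j] = sum;
        }
    });
    c
}

Splitting by output rows keeps each task independent (no shared mutable state), which is one way to make the CPU side of the comparison fairer on a multi-core machine.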
