Skip to content

Instantly share code, notes, and snippets.

Last active May 28, 2024 10:02
Show Gist options
  • Save itzmeanjan/84613bc7595372c5e6b6c22481d42f9a to your computer and use it in GitHub Desktop.
Save itzmeanjan/84613bc7595372c5e6b6c22481d42f9a to your computer and use it in GitHub Desktop.
😎 Parallel Matrix Multiplication on GPGPU, using Vulkan Compute API 🚴🏼
#version 450
#pragma shader_stage(compute)

// Work-group shape: 8 x 4 x 1 invocations. Each invocation computes exactly
// one element of the result matrix.
layout(local_size_x = 8, local_size_y = 4, local_size_z = 1) in;

// Left operand A, flattened to 1 << 20 = 1024 * 1024 ints and indexed as
// row * 1024 + col. Read-only from the shader's side.
layout(set = 0, binding = 0) buffer readonly MatrixA {
  int[1<<20] matrix_a;
};

// Right operand B, same flattened layout as A. Read-only.
layout(set = 0, binding = 1) buffer readonly MatrixB {
  int[1<<20] matrix_b;
};

// Result C = A * B, same flattened layout. Write-only.
layout(set = 0, binding = 2) buffer writeonly MatrixC {
  int[1<<20] matrix_c;
};

void main() {
  // The x component of the global invocation id selects the row of C,
  // the y component selects the column.
  const uint row = gl_GlobalInvocationID.x;
  const uint col = gl_GlobalInvocationID.y;

  // Invocations that fall outside the 1024 x 1024 matrix have nothing to
  // compute and must not touch the buffers.
  if(row >= 1024 || col >= 1024) {
    return;
  }

  // Dot product of row `row` of A with column `col` of B.
  int sum = 0;
  for(uint i = 0; i < 1024; i++) {
    sum += matrix_a[row * 1024 + i] * matrix_b[i * 1024 + col];
  }

  matrix_c[row * 1024 + col] = sum;
}
extern crate rand;
extern crate vulkano;
extern crate vulkano_shaders;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::sync::Arc;
use std::time::Instant;
use vulkano::buffer::{BufferUsage, CpuAccessibleBuffer, ImmutableBuffer};
use vulkano::command_buffer::{AutoCommandBufferBuilder, CommandBufferUsage, PrimaryCommandBuffer};
use vulkano::descriptor::descriptor_set::PersistentDescriptorSet;
use vulkano::device::{Device, DeviceExtensions, Features};
use vulkano::instance::PhysicalDevice;
use vulkano::instance::{Instance, InstanceExtensions};
use vulkano::pipeline::{ComputePipeline, ComputePipelineAbstract};
use vulkano::sync::GpuFuture;
use vulkano::Version;
// Number of elements in one flattened square matrix: 1 << 20 = 1024 * 1024.
// `generate_square_matrix` yields exactly this many values.
const N: u32 = 1 << 20;
// Multiplies two 1024 x 1024 `i32` matrices on the GPU through a Vulkan
// compute pipeline, then recomputes the product on the CPU, asserting that
// every element matches and reporting the relative timing.
//
// NOTE(review): this copy of the function looks truncated. Several statements
// are visibly incomplete (for example `let queue =`, `let command_buffer =;`
// and `let gpu_result =;`) and closing delimiters are absent, so it will not
// compile as-is. Restore the missing lines from the original source first.
fn main() {
// Vulkan instance targeting API version 1.2 with no instance extensions.
let instance = Instance::new(None, Version::V1_2, &InstanceExtensions::none(), None)
.expect("failed to create instance !");
// Enumerate the physical devices visible to this instance.
// NOTE(review): how a single device is picked from the enumeration is not
// visible in this copy.
let physical_device = PhysicalDevice::enumerate(&instance)
.expect("failed to enumerate physical devices");
// Format string for reporting the selected device and its Vulkan API version
// (the surrounding print call and its arguments are missing here).
"Device: {}\nVulkan API: {}",,
// Walk the device's queue families; per the format string below, each one is
// reported with its queue count and its compute / graphics support.
for i in physical_device.queue_families() {
"Queue Count: {}\tCompute: {}\tGraphics: {}",
// Select a queue family that supports compute work; abort if there is none.
let queue_family = physical_device
.find(|&v| v.supports_compute())
.expect("failed to find compute supported queue family");
// Start from an empty extension set and enable only
// `khr_storage_buffer_storage_class`.
let mut ext = DeviceExtensions::none();
ext.khr_storage_buffer_storage_class = true;
// Create the logical device together with its queues, passing a single
// (queue family, 0.5) request.
// NOTE(review): presumably 0.5 is the queue priority; confirm against the
// vulkano version in use.
let (logical_device, mut queues) = Device::new(
[(queue_family, 0.5)].iter().cloned(),
.expect("failed to create logical logical_device");
// NOTE(review): the expression that obtains `queue` is missing in this copy.
let queue ="failed to find associated queue");
// A and B are generated from fixed seeds (13 and 17) so the same contents can
// be regenerated later for the CPU check; C (seed `None`) starts as all zeros.
let matrix_a = generate_square_matrix(Some(13));
let matrix_b = generate_square_matrix(Some(17));
let matrix_c = generate_square_matrix(None);
// Matrix A --- stored in GPU accessible memory, CPU can't access it
// NOTE(review): the panic message says "uniform buffer", but the shader binds
// this as a storage (`buffer`) block.
let (matrix_a_buf, _) = ImmutableBuffer::from_iter(matrix_a, BufferUsage::all(), queue.clone())
.expect("failed to create uniform buffer");
// Matrix B --- stored in GPU accessible memory, CPU can't access it
let (matrix_b_buf, _) = ImmutableBuffer::from_iter(matrix_b, BufferUsage::all(), queue.clone())
.expect("failed to create uniform buffer");
// Matrix C --- resulting matrix can be accessed by both CPU, GPU
let matrix_c_buf =
CpuAccessibleBuffer::from_iter(logical_device.clone(), BufferUsage::all(), false, matrix_c)
.expect("failed to create storage buffer");
// loading compute shader, including shader compilation
// abstracted with macro!
let shader = cs::Shader::load(logical_device.clone()).unwrap();
// preparing compute pipeline
// NOTE(review): the pipeline construction arguments are missing in this copy.
let compute_pipeline = Arc::new(
// adding descriptors as per layout, into compute pipeline
// Descriptor set layout 0 corresponds to `set = 0` in the shader's bindings.
let layout = compute_pipeline.layout().descriptor_set_layout(0).unwrap();
let set = Arc::new(
// create command buffer & start recording commands in it
let mut builder = AutoCommandBufferBuilder::primary(
// only single command recorded in command buffer
// Work-group counts 128 x 256 x 1; combined with the shader's 8 x 4 x 1 local
// size this launches one invocation per element of the 1024 x 1024 result.
[1024 / 8, 1024 / 4, 1],
// ending command recording
let command_buffer =;
// Computing Matrix Multiplication on GPU
// NOTE(review): in this copy nothing between `execute` and `elapsed` waits for
// the GPU to finish; if that is also true of the real source, `gpu_tm` only
// measures submission. Confirm a fence wait happens before the timer stops.
let start = Instant::now();
let finished = command_buffer.execute(queue.clone()).unwrap();
let gpu_tm = start.elapsed();
println!("GPU matrix multiply: {:?}", gpu_tm);
// The original iterators were moved into the GPU buffers above, so regenerate
// A and B from the same seeds to obtain CPU-side copies.
let r_matrix_a = generate_square_matrix(Some(13)).collect::<Vec<i32>>();
let r_matrix_b = generate_square_matrix(Some(17)).collect::<Vec<i32>>();
// reading GPU-computed matrix multiplication result
let gpu_result =;
// Computing Matrix Multiplication on CPU, and asserting !
// Plain triple loop over the flattened (row * 1024 + col) layout; each
// finished element is compared against the GPU's value.
// NOTE(review): the inputs are full-range random `i32`s, so `*` and `+=` will
// overflow; overflow panics in debug builds. Consider `wrapping_mul` /
// `wrapping_add` if wrap-around is the intended semantics. Confirm.
let start = Instant::now();
for i in 0..1024 {
for j in 0..1024 {
let mut sum = 0i32;
for k in 0..1024 {
sum += r_matrix_a[i * 1024 + k] * r_matrix_b[k * 1024 + j];
assert_eq!(sum, gpu_result[i * 1024 + j]);
// Speed-up is the integer quotient of the two durations in nanoseconds, so
// any fractional part is discarded; the CPU time includes the assertions.
"CPU matrix multiply: {:?}\nSpeed Up: {}",
start.elapsed().as_nanos() / gpu_tm.as_nanos()
// reproducible random matrix generator, as single dimensional iterator
fn generate_square_matrix(seed: Option<u64>) -> Box<dyn std::iter::ExactSizeIterator<Item = i32>> {
match seed {
Some(seed) => {
let mut rng = StdRng::seed_from_u64(seed);
Box::new((0..N).map(move |_| rng.gen::<i32>()))
None => Box::new((0..N).map(|_| 0)),
mod cs {
// does shader compilation
vulkano_shaders::shader! {
ty: "compute",
path: "./matrix_multiply.glsl",
vulkan_version: "1.2",
Copy link

Random comment: you should rework this to use VK_KHR_cooperative_matrix, as it would make a good example for comparing against the native shader.

Thanks for the suggestion, though I'm not actively maintaining it. You may send me a patch and I'll update the gist.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment