Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Parallel LU Decomposition on GPGPU, using SYCL DPC++ 🔆
#include <CL/sycl.hpp>
#include <chrono>
#include <iostream>
#include <random>
using namespace sycl;
constexpr uint N = 1 << 10;
constexpr uint B = 1 << 5;
sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) {
auto evt = q.submit([&](handler &h) {
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_matrix{matrix, h, noinit};
h.parallel_for<class kernelIdentity>(
nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
const size_t r = it.get_global_id(0);
const size_t c = it.get_global_id(1);
if (r == c) {
acc_matrix[r][c] = 1.f;
} else {
acc_matrix[r][c] = 0.f;
}
});
});
return evt;
}
int64_t lu_decomposition(queue &q, const float *matrix, float *const ldiag,
float *const udiag) {
float *matrix_ = (float *)malloc(sizeof(float) * N * N);
memcpy(matrix_, matrix, sizeof(float) * N * N);
buffer<float, 2> b_matrix{matrix_, range<2>{N, N}};
buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}};
buffer<float, 2> b_udiag{udiag, range<2>{N, N}};
std::chrono::_V2::steady_clock::time_point start =
std::chrono::steady_clock::now();
auto evt = identity_matrix(q, b_ldiag);
evt.wait();
for (size_t i = 0; i < N - 1; i++) {
q.submit([&](handler &h) {
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_matrix{b_matrix, h};
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_ldiag{b_ldiag, h};
h.parallel_for<class kernelLDiagonal>(
nd_range<1>{N, B}, [=](nd_item<1> it) {
const size_t r = it.get_global_id(0);
if (r > i) {
acc_ldiag[r][i] = acc_matrix[r][i] / acc_matrix[i][i];
}
});
});
q.submit([&](handler &h) {
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_matrix{b_matrix, h};
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_udiag{b_udiag, h};
h.parallel_for<class kernelUDiagonal>(
nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
const size_t r = it.get_global_id(0);
const size_t c = it.get_global_id(1);
if (r > i && c >= i) {
acc_udiag[r][c] =
acc_matrix[r][c] -
(acc_matrix[i][c] * (acc_matrix[r][i] / acc_matrix[i][i]));
} else {
acc_udiag[r][c] = acc_matrix[r][c];
}
});
});
auto evt = q.submit([&](handler &h) {
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_matrix{b_matrix, h};
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_udiag{b_udiag, h};
h.copy(acc_udiag, acc_matrix);
});
evt.wait();
}
std::chrono::_V2::steady_clock::time_point end =
std::chrono::steady_clock::now();
std::free(matrix_);
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
}
// when running with example matrix, consider
// setting N = 4, B= 4
//
// below is a 4x4 matrix
void example_matrix(float *const matrix) {
matrix[0 * N + 0] = 2.f;
matrix[0 * N + 1] = 4.f;
matrix[0 * N + 2] = 3.f;
matrix[0 * N + 3] = 5.f;
matrix[1 * N + 0] = -4.f;
matrix[1 * N + 1] = -7.f;
matrix[1 * N + 2] = -5.f;
matrix[1 * N + 3] = -8.f;
matrix[2 * N + 0] = 6.f;
matrix[2 * N + 1] = 8.f;
matrix[2 * N + 2] = 2.f;
matrix[2 * N + 3] = 9.f;
matrix[3 * N + 0] = 4.f;
matrix[3 * N + 1] = 9.f;
matrix[3 * N + 2] = -2.f;
matrix[3 * N + 3] = 14.f;
}
void random_matrix(float *const adj_matrix) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(0.f, 1.f);
for (uint i = 0; i < N; i++) {
for (uint j = 0; j < N; j++) {
adj_matrix[i * N + j] = dis(gen);
}
}
}
int main(int argc, char **argv) {
device d{default_selector{}};
queue q{d};
std::cout << "running on " << d.get_info<info::device::name>() << std::endl;
float *matrix = (float *)malloc(sizeof(float) * N * N);
float *ldiag = (float *)malloc(sizeof(float) * N * N);
float *udiag = (float *)malloc(sizeof(float) * N * N);
memset(matrix, 0, sizeof(float) * N * N);
memset(ldiag, 0, sizeof(float) * N * N);
memset(udiag, 0, sizeof(float) * N * N);
random_matrix(matrix);
int64_t ts = lu_decomposition(q, matrix, ldiag, udiag);
std::cout << "LU decomposition: " << ts << " ms" << std::endl;
float max_dev = 0.f;
for (uint i = 0; i < N; i++) {
for (uint j = 0; j < N; j++) {
float sum = 0.f;
for (uint k = 0; k < N; k++) {
sum += ldiag[i * N + k] * udiag[k * N + j];
}
float dev = std::abs(matrix[i * N + j] - sum);
max_dev = std::max(dev, max_dev);
}
}
std::cout << "max deviation: " << max_dev << std::endl;
std::free(matrix);
std::free(ldiag);
std::free(udiag);
return 0;
}
@itzmeanjan
Copy link
Author

itzmeanjan commented Oct 11, 2021

Background

This program is supposed to accompany my post on SIMD implementation of LU factorization of square matrix A, such that

A = L x U

You may want to read: https://itzmeanjan.in/pages/parallel-lu-decomposition.html

Usage

  • Download gist, unzip
  • Make sure you've Intel oneAPI base toolkit installed. More here.
$ dpcpp --version
Intel(R) oneAPI DPC++/C++ Compiler 2021.4.0 (2021.4.0.20210924)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /glob/development-tools/versions/oneapi/2021.4/inteloneapi/compiler/2021.4.0/linux/bin
  • Compile & run
$ dpcpp -fsycl lu_decomposition.cpp && ./a.out

running on Intel Xeon Processor (Skylake, IBRS)
LU decomposition: 3840 ms
max deviation: 0.0765064

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment