@itzmeanjan
Last active October 15, 2021 06:36
πŸƒβ€β™‚οΈ Fast, Parallel LU Factorization, using SYCL DPC++ πŸ…
#include <CL/sycl.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <random>

using namespace sycl;

constexpr uint N = 1 << 10; // matrix dimension; the input is N x N
constexpr uint B = 1 << 6;  // work-group size
// Sets the main diagonal of `matrix` to 1; the rest of the buffer is
// expected to be zeroed already, so the result is the identity matrix.
sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) {
  auto evt = q.submit([&](handler &h) {
    accessor<float, 2, access::mode::write, access::target::global_buffer>
        acc_matrix{matrix, h};

    h.parallel_for<class kernelIdentity>(
        nd_range<1>{range<1>{N}, range<1>{B}}, [=](nd_item<1> it) {
          const size_t r = it.get_global_id(0);
          acc_matrix[r][r] = 1.f;
        });
  });
  return evt;
}
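
// Note: SYCL requires each nd_range global size to be a multiple of its
// work-group size; the per-iteration ranges below (N - (i + 1)) are
// generally not multiples of B, so strict implementations may reject those
// launches unless the range is padded up and excess work-items are masked.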
// Factors `matrix` (N x N, row-major) into L (ldiag) and U (udiag) using
// Doolittle elimination without pivoting; returns elapsed time in ms.
int64_t lu_decomposition(queue &q, const float *matrix, float *const ldiag,
                         float *const udiag) {
  memset(ldiag, 0, sizeof(float) * N * N);
  memcpy(udiag, matrix, sizeof(float) * N * N);

  buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}};
  buffer<float, 2> b_udiag{udiag, range<2>{N, N}};

  std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  identity_matrix(q, b_ldiag);
  for (size_t i = 0; i < N - 1; i++) {
    // Column i of L: L[r][i] = U[r][i] / U[i][i], for all rows r > i.
    q.submit([&](handler &h) {
      accessor<float, 2, access::mode::read, access::target::global_buffer>
          acc_udiag{b_udiag, h};
      accessor<float, 2, access::mode::write, access::target::global_buffer>
          acc_ldiag{b_ldiag, h};
      accessor<float, 1, access::mode::read_write, access::target::local>
          acc_lds{range<1>{1}, h};

      h.parallel_for<class kernelLDiagonal>(
          nd_range<1>{range<1>{N - (i + 1)}, range<1>{B}, id<1>{i + 1}},
          [=](nd_item<1> it) {
            // Each sub-group leader caches the pivot U[i][i] in local
            // memory; every leader writes the same value, so a sub-group
            // barrier suffices before reading it back.
            const sycl::ONEAPI::sub_group sg = it.get_sub_group();
            if (sycl::ONEAPI::leader(sg)) {
              acc_lds[0] = acc_udiag[i][i];
            }
            sg.barrier();

            const size_t r = it.get_global_id(0);
            acc_ldiag[r][i] = acc_udiag[r][i] / acc_lds[0];
          });
    });

    // Trailing submatrix of U: U[r][c] -= U[i][c] * (U[r][i] / U[i][i]),
    // for all rows r > i and columns c > i.
    q.submit([&](handler &h) {
      accessor<float, 2, access::mode::read_write,
               access::target::global_buffer>
          acc_udiag{b_udiag, h};
      accessor<float, 1, access::mode::read_write, access::target::local>
          acc_lds{range<1>{1}, h};

      h.parallel_for<class kernelUDiagonalA>(
          nd_range<2>{range<2>{N - (i + 1), N - (i + 1)}, range<2>{1, B},
                      id<2>{i + 1, i + 1}},
          [=](nd_item<2> it) {
            const sycl::ONEAPI::sub_group sg = it.get_sub_group();
            if (sycl::ONEAPI::leader(sg)) {
              acc_lds[0] = acc_udiag[i][i];
            }
            sg.barrier();

            const size_t r = it.get_global_id(0);
            const size_t c = it.get_global_id(1);
            acc_udiag[r][c] =
                acc_udiag[r][c] -
                (acc_udiag[i][c] * (acc_udiag[r][i] / acc_lds[0]));
          });
    });

    // Column i of U below the pivot: since acc_lds[0] == U[i][i], this
    // subtracts U[r][i] from itself, zeroing the eliminated entries.
    q.submit([&](handler &h) {
      accessor<float, 2, access::mode::read_write,
               access::target::global_buffer>
          acc_udiag{b_udiag, h};
      accessor<float, 1, access::mode::read_write, access::target::local>
          acc_lds{range<1>{1}, h};

      h.parallel_for<class kernelUDiagonalB>(
          nd_range<1>{range<1>{N - (i + 1)}, range<1>{B}, id<1>{i + 1}},
          [=](nd_item<1> it) {
            const sycl::ONEAPI::sub_group sg = it.get_sub_group();
            if (sycl::ONEAPI::leader(sg)) {
              acc_lds[0] = acc_udiag[i][i];
            }
            sg.barrier();

            const size_t r = it.get_global_id(0);
            acc_udiag[r][i] =
                acc_udiag[r][i] -
                (acc_udiag[i][i] * (acc_udiag[r][i] / acc_lds[0]));
          });
    });
  }
  // Wait for all submitted kernels to complete before stopping the timer;
  // submissions are asynchronous, so without this only the host-side
  // submission time would be measured.
  q.wait();

  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// When running with the example matrix, set N = 4 and B = 4.
//
// Fills `matrix` with the fixed 4x4 example below.
void example_matrix(float *const matrix) {
  matrix[0 * N + 0] = 2.f;
  matrix[0 * N + 1] = 4.f;
  matrix[0 * N + 2] = 3.f;
  matrix[0 * N + 3] = 5.f;
  matrix[1 * N + 0] = -4.f;
  matrix[1 * N + 1] = -7.f;
  matrix[1 * N + 2] = -5.f;
  matrix[1 * N + 3] = -8.f;
  matrix[2 * N + 0] = 6.f;
  matrix[2 * N + 1] = 8.f;
  matrix[2 * N + 2] = 2.f;
  matrix[2 * N + 3] = 9.f;
  matrix[3 * N + 0] = 4.f;
  matrix[3 * N + 1] = 9.f;
  matrix[3 * N + 2] = -2.f;
  matrix[3 * N + 3] = 14.f;
}
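
// For reference, carrying out Doolittle elimination (no pivoting) on the
// example above by hand gives the factors below (hand-computed, not
// program output):
//
//         |  1  0  0  0 |          | 2  4   3   5 |
//     L = | -2  1  0  0 |      U = | 0  1   1   2 |
//         |  3 -4  1  0 |          | 0  0  -3   2 |
//         |  2  1  3  1 |          | 0  0   0  -4 |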
// Fills `matrix` with uniformly distributed values in [0, 1).
void random_matrix(float *const matrix) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(0.f, 1.f);

  for (uint i = 0; i < N; i++) {
    for (uint j = 0; j < N; j++) {
      matrix[i * N + j] = dis(gen);
    }
  }
}
int main(int argc, char **argv) {
  device d{default_selector{}};
  queue q{d};
  std::cout << "running on " << d.get_info<info::device::name>() << std::endl;

  float *matrix = (float *)malloc(sizeof(float) * N * N);
  float *ldiag = (float *)malloc(sizeof(float) * N * N);
  float *udiag = (float *)malloc(sizeof(float) * N * N);

  memset(matrix, 0, sizeof(float) * N * N);
  memset(ldiag, 0, sizeof(float) * N * N);
  memset(udiag, 0, sizeof(float) * N * N);

  random_matrix(matrix);

  int64_t ts = lu_decomposition(q, matrix, ldiag, udiag);
  std::cout << "LU decomposition: " << ts << " ms" << std::endl;

  // Verify by multiplying L and U back together and comparing the product
  // against the original input, entry by entry.
  float max_dev = 0.f;
  for (uint i = 0; i < N; i++) {
    for (uint j = 0; j < N; j++) {
      float sum = 0.f;
      for (uint k = 0; k < N; k++) {
        sum += ldiag[i * N + k] * udiag[k * N + j];
      }

      float dev = std::abs(matrix[i * N + j] - sum);
      max_dev = std::max(dev, max_dev);
    }
  }
  std::cout << "max deviation: " << max_dev << std::endl;

  std::free(matrix);
  std::free(ldiag);
  std::free(udiag);

  return 0;
}
@itzmeanjan (Author)

Background

This is an attempt to improve the performance of a parallel LU factorization implementation I've already written here.

This program also accompanies a post I wrote here.
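
For reference, the per-iteration update the three kernels implement is Doolittle elimination without pivoting. Writing the entries of U as $u_{rc}$ and of L as $\ell_{rc}$, iteration $i$ computes, for all rows $r > i$ and columns $c \ge i$:

$$
\ell_{ri} = \frac{u_{ri}}{u_{ii}}, \qquad
u_{rc} \leftarrow u_{rc} - u_{ic} \, \frac{u_{ri}}{u_{ii}}
$$

kernelLDiagonal produces the $\ell_{ri}$ column, kernelUDiagonalA applies the update for $c > i$, and kernelUDiagonalB handles $c = i$, which reduces to zeroing $u_{ri}$.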

Usage

  • Download/clone the gist
  • Make sure you have the Intel oneAPI Base Toolkit installed; check here
  • Compile and run
$ dpcpp -fsycl improved_lu_decomposition.cpp && ./a.out

running on Intel Xeon Processor (Skylake, IBRS)
LU decomposition: 180 ms
max deviation: 0.318212
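
To sanity-check the parallel output on a small N, a minimal sequential sketch of the same elimination can be compared entry by entry against ldiag/udiag (lu_reference is a hypothetical helper, not part of this gist):

#include <cstring>

// Sequential Doolittle elimination without pivoting; same conventions as the
// gist: `l` starts as the identity, `u` as a copy of the input `a`.
void lu_reference(const float *a, float *l, float *u, unsigned n) {
  memset(l, 0, sizeof(float) * n * n);
  memcpy(u, a, sizeof(float) * n * n);
  for (unsigned i = 0; i < n; i++)
    l[i * n + i] = 1.f;

  for (unsigned i = 0; i < n - 1; i++) {
    for (unsigned r = i + 1; r < n; r++) {
      const float factor = u[r * n + i] / u[i * n + i]; // L[r][i]
      l[r * n + i] = factor;
      // Update row r of U; starting at c = i zeroes u[r][i] exactly.
      for (unsigned c = i; c < n; c++)
        u[r * n + c] -= factor * u[i * n + c];
    }
  }
}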
