Last active
October 15, 2021 06:36
-
-
Save itzmeanjan/82e57b64420c417d55133bd443aebdf6 to your computer and use it in GitHub Desktop.
πββοΈ Fast, Parallel LU Factorization, using SYCL DPC++ π
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <CL/sycl.hpp> | |
#include <chrono> | |
#include <iostream> | |
#include <random> | |
using namespace sycl; | |
constexpr uint N = 1 << 10; | |
constexpr uint B = 1 << 6; | |
sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) { | |
auto evt = q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_matrix{matrix, h}; | |
h.parallel_for<class kernelIdentity>(nd_range<1>{range<1>{N}, range<1>{B}}, | |
[=](nd_item<1> it) { | |
const size_t r = it.get_global_id(0); | |
acc_matrix[r][r] = 1.f; | |
}); | |
}); | |
return evt; | |
} | |
int64_t lu_decomposition(queue &q, const float *matrix, float *const ldiag, | |
float *const udiag) { | |
memset(ldiag, 0, sizeof(float) * N * N); | |
memcpy(udiag, matrix, sizeof(float) * N * N); | |
buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}}; | |
buffer<float, 2> b_udiag{udiag, range<2>{N, N}}; | |
std::chrono::_V2::steady_clock::time_point start = | |
std::chrono::steady_clock::now(); | |
identity_matrix(q, b_ldiag); | |
for (size_t i = 0; i < N - 1; i++) { | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_udiag{b_udiag, h}; | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_ldiag{b_ldiag, h}; | |
accessor<float, 1, access::mode::read_write, access::target::local> | |
acc_lds{range<1>{1}, h}; | |
h.parallel_for<class kernelLDiagonal>( | |
nd_range<1>{range<1>{N - (i + 1)}, range<1>{B}, id<1>{i + 1}}, | |
[=](nd_item<1> it) { | |
const sycl::ONEAPI::sub_group sg = it.get_sub_group(); | |
if (sycl::ONEAPI::leader(sg)) { | |
acc_lds[0] = acc_udiag[i][i]; | |
} | |
sg.barrier(); | |
const size_t r = it.get_global_id(0); | |
acc_ldiag[r][i] = acc_udiag[r][i] / acc_lds[0]; | |
}); | |
}); | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read_write, | |
access::target::global_buffer> | |
acc_udiag{b_udiag, h}; | |
accessor<float, 1, access::mode::read_write, access::target::local> | |
acc_lds{range<1>{1}, h}; | |
h.parallel_for<class kernelUDiagonalA>( | |
nd_range<2>{range<2>{N - (i + 1), N - (i + 1)}, range<2>{1, B}, | |
id<2>{i + 1, i + 1}}, | |
[=](nd_item<2> it) { | |
const sycl::ONEAPI::sub_group sg = it.get_sub_group(); | |
if (sycl::ONEAPI::leader(sg)) { | |
acc_lds[0] = acc_udiag[i][i]; | |
} | |
sg.barrier(); | |
const size_t r = it.get_global_id(0); | |
const size_t c = it.get_global_id(1); | |
acc_udiag[r][c] = | |
acc_udiag[r][c] - | |
(acc_udiag[i][c] * (acc_udiag[r][i] / acc_lds[0])); | |
}); | |
}); | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read_write, | |
access::target::global_buffer> | |
acc_udiag{b_udiag, h}; | |
accessor<float, 1, access::mode::read_write, access::target::local> | |
acc_lds{range<1>{1}, h}; | |
h.parallel_for<class kernelUDiagonalB>( | |
nd_range<1>{range<1>{N - (i + 1)}, range<1>{B}, id<1>{i + 1}}, | |
[=](nd_item<1> it) { | |
const sycl::ONEAPI::sub_group sg = it.get_sub_group(); | |
if (sycl::ONEAPI::leader(sg)) { | |
acc_lds[0] = acc_udiag[i][i]; | |
} | |
sg.barrier(); | |
const size_t r = it.get_global_id(0); | |
acc_udiag[r][i] = | |
acc_udiag[r][i] - | |
(acc_udiag[i][i] * (acc_udiag[r][i] / acc_lds[0])); | |
}); | |
}); | |
} | |
std::chrono::_V2::steady_clock::time_point end = | |
std::chrono::steady_clock::now(); | |
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start) | |
.count(); | |
} | |
// when running with example matrix, consider | |
// setting N = 4, B= 4 | |
// | |
// below is a 4x4 matrix | |
void example_matrix(float *const matrix) { | |
matrix[0 * N + 0] = 2.f; | |
matrix[0 * N + 1] = 4.f; | |
matrix[0 * N + 2] = 3.f; | |
matrix[0 * N + 3] = 5.f; | |
matrix[1 * N + 0] = -4.f; | |
matrix[1 * N + 1] = -7.f; | |
matrix[1 * N + 2] = -5.f; | |
matrix[1 * N + 3] = -8.f; | |
matrix[2 * N + 0] = 6.f; | |
matrix[2 * N + 1] = 8.f; | |
matrix[2 * N + 2] = 2.f; | |
matrix[2 * N + 3] = 9.f; | |
matrix[3 * N + 0] = 4.f; | |
matrix[3 * N + 1] = 9.f; | |
matrix[3 * N + 2] = -2.f; | |
matrix[3 * N + 3] = 14.f; | |
} | |
void random_matrix(float *const matrix) { | |
std::random_device rd; | |
std::mt19937 gen(rd()); | |
std::uniform_real_distribution<float> dis(0.f, 1.f); | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N; j++) { | |
matrix[i * N + j] = dis(gen); | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
float *matrix = (float *)malloc(sizeof(float) * N * N); | |
float *ldiag = (float *)malloc(sizeof(float) * N * N); | |
float *udiag = (float *)malloc(sizeof(float) * N * N); | |
memset(matrix, 0, sizeof(float) * N * N); | |
memset(ldiag, 0, sizeof(float) * N * N); | |
memset(udiag, 0, sizeof(float) * N * N); | |
random_matrix(matrix); | |
int64_t ts = lu_decomposition(q, matrix, ldiag, udiag); | |
std::cout << "LU decomposition: " << ts << " ms" << std::endl; | |
float max_dev = 0.f; | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N; j++) { | |
float sum = 0.f; | |
for (uint k = 0; k < N; k++) { | |
sum += ldiag[i * N + k] * udiag[k * N + j]; | |
} | |
float dev = std::abs(matrix[i * N + j] - sum); | |
max_dev = std::max(dev, max_dev); | |
} | |
} | |
std::cout << "max deviation: " << max_dev << std::endl; | |
std::free(matrix); | |
std::free(ldiag); | |
std::free(udiag); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Background
This is an attempt to improve performance of parallel LU factorization implementation, which I've already written here.
This program also accompanies post I wrote here.
Usage
$ dpcpp -fsycl improved_lu_decomposition.cpp && ./a.out running on Intel Xeon Processor (Skylake, IBRS) LU decomposition: 180 ms max deviation: 0.318212