This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <CL/sycl.hpp> | |
#include <chrono> | |
#include <iostream> | |
#include <random> | |
using namespace sycl; | |
constexpr uint N = 1 << 10; | |
constexpr uint B = 1 << 5; | |
sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) { | |
auto evt = q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_matrix{matrix, h, noinit}; | |
h.parallel_for<class kernelIdentity>( | |
nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) { | |
const size_t r = it.get_global_id(0); | |
const size_t c = it.get_global_id(1); | |
if (r == c) { | |
acc_matrix[r][c] = 1.f; | |
} else { | |
acc_matrix[r][c] = 0.f; | |
} | |
}); | |
}); | |
return evt; | |
} | |
int64_t lu_decomposition(queue &q, const float *matrix, float *const ldiag, | |
float *const udiag) { | |
float *matrix_ = (float *)malloc(sizeof(float) * N * N); | |
memcpy(matrix_, matrix, sizeof(float) * N * N); | |
buffer<float, 2> b_matrix{matrix_, range<2>{N, N}}; | |
buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}}; | |
buffer<float, 2> b_udiag{udiag, range<2>{N, N}}; | |
std::chrono::_V2::steady_clock::time_point start = | |
std::chrono::steady_clock::now(); | |
auto evt = identity_matrix(q, b_ldiag); | |
evt.wait(); | |
for (size_t i = 0; i < N - 1; i++) { | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_matrix{b_matrix, h}; | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_ldiag{b_ldiag, h}; | |
h.parallel_for<class kernelLDiagonal>( | |
nd_range<1>{N, B}, [=](nd_item<1> it) { | |
const size_t r = it.get_global_id(0); | |
if (r > i) { | |
acc_ldiag[r][i] = acc_matrix[r][i] / acc_matrix[i][i]; | |
} | |
}); | |
}); | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_matrix{b_matrix, h}; | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_udiag{b_udiag, h}; | |
h.parallel_for<class kernelUDiagonal>( | |
nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) { | |
const size_t r = it.get_global_id(0); | |
const size_t c = it.get_global_id(1); | |
if (r > i && c >= i) { | |
acc_udiag[r][c] = | |
acc_matrix[r][c] - | |
(acc_matrix[i][c] * (acc_matrix[r][i] / acc_matrix[i][i])); | |
} else { | |
acc_udiag[r][c] = acc_matrix[r][c]; | |
} | |
}); | |
}); | |
auto evt = q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_matrix{b_matrix, h}; | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_udiag{b_udiag, h}; | |
h.copy(acc_udiag, acc_matrix); | |
}); | |
evt.wait(); | |
} | |
std::chrono::_V2::steady_clock::time_point end = | |
std::chrono::steady_clock::now(); | |
std::free(matrix_); | |
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start) | |
.count(); | |
} | |
// when running with example matrix, consider | |
// setting N = 4, B= 4 | |
// | |
// below is a 4x4 matrix | |
void example_matrix(float *const matrix) { | |
matrix[0 * N + 0] = 2.f; | |
matrix[0 * N + 1] = 4.f; | |
matrix[0 * N + 2] = 3.f; | |
matrix[0 * N + 3] = 5.f; | |
matrix[1 * N + 0] = -4.f; | |
matrix[1 * N + 1] = -7.f; | |
matrix[1 * N + 2] = -5.f; | |
matrix[1 * N + 3] = -8.f; | |
matrix[2 * N + 0] = 6.f; | |
matrix[2 * N + 1] = 8.f; | |
matrix[2 * N + 2] = 2.f; | |
matrix[2 * N + 3] = 9.f; | |
matrix[3 * N + 0] = 4.f; | |
matrix[3 * N + 1] = 9.f; | |
matrix[3 * N + 2] = -2.f; | |
matrix[3 * N + 3] = 14.f; | |
} | |
void random_matrix(float *const adj_matrix) { | |
std::random_device rd; | |
std::mt19937 gen(rd()); | |
std::uniform_real_distribution<float> dis(0.f, 1.f); | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N; j++) { | |
adj_matrix[i * N + j] = dis(gen); | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
float *matrix = (float *)malloc(sizeof(float) * N * N); | |
float *ldiag = (float *)malloc(sizeof(float) * N * N); | |
float *udiag = (float *)malloc(sizeof(float) * N * N); | |
memset(matrix, 0, sizeof(float) * N * N); | |
memset(ldiag, 0, sizeof(float) * N * N); | |
memset(udiag, 0, sizeof(float) * N * N); | |
random_matrix(matrix); | |
int64_t ts = lu_decomposition(q, matrix, ldiag, udiag); | |
std::cout << "LU decomposition: " << ts << " ms" << std::endl; | |
float max_dev = 0.f; | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N; j++) { | |
float sum = 0.f; | |
for (uint k = 0; k < N; k++) { | |
sum += ldiag[i * N + k] * udiag[k * N + j]; | |
} | |
float dev = std::abs(matrix[i * N + j] - sum); | |
max_dev = std::max(dev, max_dev); | |
} | |
} | |
std::cout << "max deviation: " << max_dev << std::endl; | |
std::free(matrix); | |
std::free(ldiag); | |
std::free(udiag); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Background
This program is supposed to accompany my post on SIMD implementation of LU factorization of square matrix A, such that
You may want to read: https://itzmeanjan.in/pages/parallel-lu-decomposition.html
Usage
$ dpcpp -fsycl lu_decomposition.cpp && ./a.out running on Intel Xeon Processor (Skylake, IBRS) LU decomposition: 3840 ms max deviation: 0.0765064