Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
πŸ”₯ Solving System of Linear Equations on GPGPU, using SYCL DPC++ ( GAUSS-JORDAN Method ) πŸ‘Œ
#include <CL/sycl.hpp>
#include <chrono>
#include <iostream>
#include <random>
using namespace sycl;
constexpr uint N = 1024;
constexpr uint B = 32;
void random_fill_up(float *const data, const uint count) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(0.f, 1.f);
for (uint i = 0; i < count; i++) {
*(data + i) = dis(gen);
}
}
int64_t gauss_jordan(queue &q, float *const aug_mat_in,
float *const aug_mat_out) {
float *aug_mat_in_ = (float *)malloc(sizeof(float) * N * (N + 1));
memcpy(aug_mat_in_, aug_mat_in, sizeof(float) * (N + 1) * N);
buffer<float, 2> b_aug_mat_in(aug_mat_in_, range<2>{N, N + 1});
buffer<float, 2> b_aug_mat_out(aug_mat_out, range<2>{N, N + 1});
std::chrono::_V2::steady_clock::time_point start =
std::chrono::steady_clock::now();
for (uint j = 0; j < N; j++) {
q.submit([&](handler &h) {
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_mat_in(b_aug_mat_in, h);
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_mat_out(b_aug_mat_out, h);
h.parallel_for(nd_range<2>{{N, N + 1}, {B, 1}}, [=](nd_item<2> it) {
const size_t i = it.get_global_id(0);
const size_t k = it.get_global_id(1);
acc_mat_out[i][k] = acc_mat_in[i][k];
it.barrier();
if (i == j || k < j || k >= (N + 1)) {
return;
}
if (j == k) {
acc_mat_out[i][k] = acc_mat_in[i][k] - acc_mat_in[i][j];
} else {
acc_mat_out[i][k] =
acc_mat_in[i][k] -
(acc_mat_in[i][j] / acc_mat_in[j][j]) * acc_mat_in[j][k];
}
});
});
q.submit([&](handler &h) {
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_mat_in(b_aug_mat_in, h);
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_mat_out(b_aug_mat_out, h);
h.copy(acc_mat_out, acc_mat_in);
});
}
q.submit([&](handler &h) {
accessor<float, 2, access::mode::read_write, access::target::global_buffer>
acc_mat_in(b_aug_mat_in, h);
h.parallel_for(nd_range<2>{range<2>{N, 1}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint r = it.get_global_id(0);
acc_mat_in[r][N] /= acc_mat_in[r][r];
});
});
auto evt = q.submit([&](handler &h) {
accessor<float, 2, access::mode::read, access::target::global_buffer>
acc_mat_in(b_aug_mat_in, h);
accessor<float, 2, access::mode::write, access::target::global_buffer>
acc_mat_out(b_aug_mat_out, h);
h.copy(acc_mat_in, acc_mat_out);
});
evt.wait();
std::chrono::_V2::steady_clock::time_point end =
std::chrono::steady_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
}
void display(const float *data, const uint w, const uint h) {
for (uint i = 0; i < h; i++) {
for (uint j = 0; j < w; j++) {
std::cout << data[i * w + j] << "\t";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void display_solution(const float *data) {
std::cout << "( ";
for (uint i = 0; i < N; i++) {
std::cout << data[i * (N + 1) + N] << ", ";
}
std::cout << " )" << std::endl;
}
void extract_matrix(const float *src, float *const dst) {
for (uint i = 0; i < N; i++) {
for (uint j = 0; j < N + 1; j++) {
if (j == N) {
break;
}
*(dst + i * N + j) = *(src + i * (N + 1) + j);
}
}
}
void extract_vector(const float *src, float *const dst) {
for (uint i = 0; i < N; i++) {
*(dst + i) = *(src + i * (N + 1) + N);
}
}
void multiply_matrix_vector(queue &q, const float *matrix, const float *vector,
float *const result) {
buffer<float, 1> b_matrix(matrix, range<1>{N * N});
buffer<float, 1> b_vector(vector, range<1>{N});
buffer<float, 1> b_result(result, range<1>{N});
auto event = q.submit([&](handler &h) {
accessor<float, 1, access::mode::read, access::target::global_buffer>
acc_matrix(b_matrix, h);
accessor<float, 1, access::mode::read, access::target::global_buffer>
acc_vector(b_vector, h);
accessor<float, 1, access::mode::write, access::target::global_buffer>
acc_result(b_result, h);
accessor<float, 1, access::mode::read_write, access::target::local>
acc_local(range<1>{B}, h);
h.parallel_for(nd_range<1>{{N}, {B}}, [=](nd_item<1> it) {
const size_t r = it.get_global_id(0);
const size_t l = it.get_local_id(0);
float sum = 0.f;
for (size_t k = 0; k < N; k += B) {
acc_local[l] = acc_vector[k + l];
it.barrier();
for (uint k_ = 0; k_ < B; k_++) {
sum += acc_local[k_] * acc_matrix[r * N + (k + k_)];
}
it.barrier();
}
acc_result[r] = sum;
});
});
event.wait();
}
void prepare_augmented_matrix(const float *matrix_a, const float *vector_b,
float *const aug_mat) {
for (uint i = 0; i < N; i++) {
for (uint j = 0; j < N + 1; j++) {
if (j == N) {
aug_mat[i * (N + 1) + j] = vector_b[i];
break;
}
aug_mat[i * (N + 1) + j] = matrix_a[i * N + j];
}
}
}
int main(int argc, char **argv) {
device d{default_selector{}};
queue q{d};
std::cout << "running on " << d.get_info<info::device::name>() << std::endl;
float *matrix_a = (float *)malloc(sizeof(float) * N * N);
float *vector_x = (float *)malloc(sizeof(float) * N * 1);
float *vector_b = (float *)malloc(sizeof(float) * N * 1);
random_fill_up(matrix_a, N * N);
random_fill_up(vector_x, N * 1);
memset(vector_b, 0, sizeof(float) * N);
multiply_matrix_vector(q, matrix_a, vector_x, vector_b);
float *aug_matrix_in = (float *)malloc(sizeof(float) * N * (N + 1));
float *aug_matrix_out = (float *)malloc(sizeof(float) * N * (N + 1));
prepare_augmented_matrix(matrix_a, vector_b, aug_matrix_in);
memset(aug_matrix_out, 0, sizeof(float) * (N + 1) * N);
int64_t ts = gauss_jordan(q, aug_matrix_in, aug_matrix_out);
float *vector_solved = (float *)malloc(sizeof(float) * N * 1);
extract_vector(aug_matrix_out, vector_solved);
float max_diff = 0.f;
for (uint i = 0; i < N; i++) {
float diff = std::fabs(*(vector_solved + i) - *(vector_x + i));
if (diff > max_diff) {
max_diff = diff;
}
}
std::cout << "solved in " << ts << " ms\t|\tmax error " << max_diff << std::endl;
return 0;
}
@itzmeanjan
Copy link
Author

itzmeanjan commented Oct 3, 2021

Background

This is an implementation for solving system of linear equations with large N (say >= 1024), written in SYCL DPC++. It accompanies blog post, I wrote.

Usage

  • Make sure you've installed oneAPI basekit toolchain
  • I'm using dpcpp for compiling above program
$ dpcpp --version

Intel(R) oneAPI DPC++/C++ Compiler 2021.3.0 (2021.3.0.20210619)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/intel/oneapi/compiler/2021.3.0/linux/bin
  • Compile & run
$ dpcpp -fsycl gauss_jordan.cpp && ./a.out

running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
solved in 5156 ms	|	max error 0.0918417

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment