This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <CL/sycl.hpp> | |
#include <chrono> | |
#include <iostream> | |
#include <random> | |
using namespace sycl; | |
constexpr uint N = 1024; | |
constexpr uint B = 32; | |
void random_fill_up(float *const data, const uint count) { | |
std::random_device rd; | |
std::mt19937 gen(rd()); | |
std::uniform_real_distribution<float> dis(0.f, 1.f); | |
for (uint i = 0; i < count; i++) { | |
*(data + i) = dis(gen); | |
} | |
} | |
int64_t gauss_jordan(queue &q, float *const aug_mat_in, | |
float *const aug_mat_out) { | |
float *aug_mat_in_ = (float *)malloc(sizeof(float) * N * (N + 1)); | |
memcpy(aug_mat_in_, aug_mat_in, sizeof(float) * (N + 1) * N); | |
buffer<float, 2> b_aug_mat_in(aug_mat_in_, range<2>{N, N + 1}); | |
buffer<float, 2> b_aug_mat_out(aug_mat_out, range<2>{N, N + 1}); | |
std::chrono::_V2::steady_clock::time_point start = | |
std::chrono::steady_clock::now(); | |
for (uint j = 0; j < N; j++) { | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_mat_in(b_aug_mat_in, h); | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_mat_out(b_aug_mat_out, h); | |
h.parallel_for(nd_range<2>{{N, N + 1}, {B, 1}}, [=](nd_item<2> it) { | |
const size_t i = it.get_global_id(0); | |
const size_t k = it.get_global_id(1); | |
acc_mat_out[i][k] = acc_mat_in[i][k]; | |
it.barrier(); | |
if (i == j || k < j || k >= (N + 1)) { | |
return; | |
} | |
if (j == k) { | |
acc_mat_out[i][k] = acc_mat_in[i][k] - acc_mat_in[i][j]; | |
} else { | |
acc_mat_out[i][k] = | |
acc_mat_in[i][k] - | |
(acc_mat_in[i][j] / acc_mat_in[j][j]) * acc_mat_in[j][k]; | |
} | |
}); | |
}); | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_mat_in(b_aug_mat_in, h); | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_mat_out(b_aug_mat_out, h); | |
h.copy(acc_mat_out, acc_mat_in); | |
}); | |
} | |
q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read_write, access::target::global_buffer> | |
acc_mat_in(b_aug_mat_in, h); | |
h.parallel_for(nd_range<2>{range<2>{N, 1}, range<2>{B, 1}}, | |
[=](nd_item<2> it) { | |
const uint r = it.get_global_id(0); | |
acc_mat_in[r][N] /= acc_mat_in[r][r]; | |
}); | |
}); | |
auto evt = q.submit([&](handler &h) { | |
accessor<float, 2, access::mode::read, access::target::global_buffer> | |
acc_mat_in(b_aug_mat_in, h); | |
accessor<float, 2, access::mode::write, access::target::global_buffer> | |
acc_mat_out(b_aug_mat_out, h); | |
h.copy(acc_mat_in, acc_mat_out); | |
}); | |
evt.wait(); | |
std::chrono::_V2::steady_clock::time_point end = | |
std::chrono::steady_clock::now(); | |
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start) | |
.count(); | |
} | |
void display(const float *data, const uint w, const uint h) { | |
for (uint i = 0; i < h; i++) { | |
for (uint j = 0; j < w; j++) { | |
std::cout << data[i * w + j] << "\t"; | |
} | |
std::cout << std::endl; | |
} | |
std::cout << std::endl; | |
} | |
void display_solution(const float *data) { | |
std::cout << "( "; | |
for (uint i = 0; i < N; i++) { | |
std::cout << data[i * (N + 1) + N] << ", "; | |
} | |
std::cout << " )" << std::endl; | |
} | |
void extract_matrix(const float *src, float *const dst) { | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N + 1; j++) { | |
if (j == N) { | |
break; | |
} | |
*(dst + i * N + j) = *(src + i * (N + 1) + j); | |
} | |
} | |
} | |
void extract_vector(const float *src, float *const dst) { | |
for (uint i = 0; i < N; i++) { | |
*(dst + i) = *(src + i * (N + 1) + N); | |
} | |
} | |
void multiply_matrix_vector(queue &q, const float *matrix, const float *vector, | |
float *const result) { | |
buffer<float, 1> b_matrix(matrix, range<1>{N * N}); | |
buffer<float, 1> b_vector(vector, range<1>{N}); | |
buffer<float, 1> b_result(result, range<1>{N}); | |
auto event = q.submit([&](handler &h) { | |
accessor<float, 1, access::mode::read, access::target::global_buffer> | |
acc_matrix(b_matrix, h); | |
accessor<float, 1, access::mode::read, access::target::global_buffer> | |
acc_vector(b_vector, h); | |
accessor<float, 1, access::mode::write, access::target::global_buffer> | |
acc_result(b_result, h); | |
accessor<float, 1, access::mode::read_write, access::target::local> | |
acc_local(range<1>{B}, h); | |
h.parallel_for(nd_range<1>{{N}, {B}}, [=](nd_item<1> it) { | |
const size_t r = it.get_global_id(0); | |
const size_t l = it.get_local_id(0); | |
float sum = 0.f; | |
for (size_t k = 0; k < N; k += B) { | |
acc_local[l] = acc_vector[k + l]; | |
it.barrier(); | |
for (uint k_ = 0; k_ < B; k_++) { | |
sum += acc_local[k_] * acc_matrix[r * N + (k + k_)]; | |
} | |
it.barrier(); | |
} | |
acc_result[r] = sum; | |
}); | |
}); | |
event.wait(); | |
} | |
void prepare_augmented_matrix(const float *matrix_a, const float *vector_b, | |
float *const aug_mat) { | |
for (uint i = 0; i < N; i++) { | |
for (uint j = 0; j < N + 1; j++) { | |
if (j == N) { | |
aug_mat[i * (N + 1) + j] = vector_b[i]; | |
break; | |
} | |
aug_mat[i * (N + 1) + j] = matrix_a[i * N + j]; | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
float *matrix_a = (float *)malloc(sizeof(float) * N * N); | |
float *vector_x = (float *)malloc(sizeof(float) * N * 1); | |
float *vector_b = (float *)malloc(sizeof(float) * N * 1); | |
random_fill_up(matrix_a, N * N); | |
random_fill_up(vector_x, N * 1); | |
memset(vector_b, 0, sizeof(float) * N); | |
multiply_matrix_vector(q, matrix_a, vector_x, vector_b); | |
float *aug_matrix_in = (float *)malloc(sizeof(float) * N * (N + 1)); | |
float *aug_matrix_out = (float *)malloc(sizeof(float) * N * (N + 1)); | |
prepare_augmented_matrix(matrix_a, vector_b, aug_matrix_in); | |
memset(aug_matrix_out, 0, sizeof(float) * (N + 1) * N); | |
int64_t ts = gauss_jordan(q, aug_matrix_in, aug_matrix_out); | |
float *vector_solved = (float *)malloc(sizeof(float) * N * 1); | |
extract_vector(aug_matrix_out, vector_solved); | |
float max_diff = 0.f; | |
for (uint i = 0; i < N; i++) { | |
float diff = std::fabs(*(vector_solved + i) - *(vector_x + i)); | |
if (diff > max_diff) { | |
max_diff = diff; | |
} | |
} | |
std::cout << "solved in " << ts << " ms\t|\tmax error " << max_diff << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Background
This is an implementation for solving system of linear equations with large N (say >= 1024), written in SYCL DPC++. It accompanies blog post, I wrote.
Usage
dpcpp
for compiling above program