Last active
November 17, 2021 14:34
-
-
Save itzmeanjan/13b5efdff14f9c4877496947f1d9e449 to your computer and use it in GitHub Desktop.
⭐️ Parallel DFT on GPGPU, using SYCL DPC++ 🔥
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <CL/sycl.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
using namespace sycl;

// Complex sample type used throughout the DFT pipeline.
using cmplx = std::complex<double>;

// N: number of DFT points (and DFT-matrix dimension); B: work-group tile size.
constexpr uint N = 1 << 10;
constexpr uint B = 1 << 5;

constexpr double PI = 3.141592653589793;
// Twiddle angle step: e^(i*THETA) is the primitive N-th root of unity used by
// the forward transform (negative sign convention).
constexpr double THETA = (-2.0 * PI) / (double)N;
// Fills `matrix` (row-major N x N) with the DFT matrix:
// matrix[r][c] = e^(i * THETA * r * c).
//
// Each work-item computes its element directly from the angle instead of the
// original std::pow(w, r*c): complex pow is implemented via exp/log, which
// accumulates rounding error for exponents as large as (N-1)^2 and leans on
// std::complex transcendental support inside device code. cos/sin of the
// exact angle is cheaper and numerically tighter.
//
// Returns the kernel event; write-back to `matrix` is completed by the buffer
// destructor before this function returns.
sycl::event compute_dft_matrix(queue &q, cmplx *const matrix) {
  buffer<cmplx, 2> b_matrix{matrix, range<2>{N, N}};
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit}; // noinit: every element overwritten
    h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{1, B}},
                   [=](nd_item<2> it) {
                     const uint r = it.get_global_id(0);
                     const uint c = it.get_global_id(1);
                     // r*c <= (N-1)^2 ~ 2^20: no uint overflow for N = 1024.
                     const double ang = THETA * (double)(r * c);
                     acc_b_matrix[r][c] = cmplx(std::cos(ang), std::sin(ang));
                   });
  });
  return evt;
}
// Computes dst_vec = matrix * src_vec (N x N complex matrix, real vector).
// Each work-group of size B stages a tile of the source vector in local
// memory, so every src_vec element is fetched from global memory once per
// work-group rather than once per work-item.
sycl::event perform_multiplication(queue &q, const double *src_vec,
                                   const cmplx *matrix, cmplx *const dst_vec) {
  buffer<double, 1> b_src_vec{src_vec, range<1>{N}};
  buffer<cmplx, 1> b_matrix(matrix, range<1>{N * N});
  buffer<cmplx, 1> b_dst_vec{dst_vec, range<1>{N}};

  auto evt = q.submit([&](handler &h) {
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        in_vec{b_src_vec, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        in_mat(b_matrix, h);
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        out_vec{b_dst_vec, h, noinit};
    // One B-element tile of the source vector, shared by the work-group.
    accessor<cmplx, 1, access::mode::read_write, access::target::local> tile(
        range<1>{B}, h);

    h.parallel_for(nd_range<1>{{N}, {B}}, [=](nd_item<1> it) {
      const size_t row = it.get_global_id(0);
      const size_t lid = it.get_local_id(0);

      cmplx acc = cmplx(0., 0.);
      for (size_t base = 0; base < N; base += B) {
        // Cooperatively stage the next B source elements into local memory.
        tile[lid] = in_vec[base + lid];
        it.barrier(); // tile fully populated before anyone reads it
        for (uint j = 0; j < B; j++) {
          acc += tile[j] * in_mat[row * N + (base + j)];
        }
        it.barrier(); // all reads finished before the tile is overwritten
      }
      out_vec[row] = acc;
    });
  });
  return evt;
}
// DFT v0: materialize the full DFT matrix on the device, then multiply it
// with the source vector. Returns elapsed wall time in milliseconds.
//
// Fixes over the original:
//  - std::chrono::_V2 is a libstdc++ implementation-detail namespace; the
//    portable spelling is std::chrono::steady_clock.
//  - the N*N scratch matrix is owned by a std::vector (RAII) instead of
//    malloc/memset/free; value-initialization already zeroes the elements.
int64_t compute_dft_v0(queue &q, const double *src_vec, cmplx *const dst_vec) {
  std::vector<cmplx> matrix(N * N); // zero-initialized scratch

  const auto start = std::chrono::steady_clock::now();

  auto evt = compute_dft_matrix(q, matrix.data());
  evt.wait();
  evt = perform_multiplication(q, src_vec, matrix.data(), dst_vec);
  evt.wait();

  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// DFT v1: two fused kernels — build the DFT matrix with the source vector
// already multiplied into each column, then reduce each row to one output
// element. Returns elapsed wall time in milliseconds.
//
// Fixes over the original:
//  - use-after-free: the original called free(matrix) while the SYCL buffers
//    below (whose host pointer is `matrix`) were still alive, so the buffer
//    write-back on destruction wrote into freed memory. Declaring the vector
//    *before* the buffers makes destruction order (buffers first, storage
//    last) correct automatically.
//  - std::chrono::_V2 internal namespace replaced with the portable
//    std::chrono::steady_clock spelling.
int64_t compute_dft_v1(queue &q, const double *src_vec, cmplx *const dst_vec) {
  std::vector<cmplx> matrix(N * N); // zero-initialized; outlives the buffers

  buffer<cmplx, 2> b_matrix{matrix.data(), range<2>{N, N}};
  buffer<double, 1> b_src{src_vec, range<1>{N}};
  buffer<cmplx, 1> b_dst{dst_vec, range<1>{N}};

  const auto start = std::chrono::steady_clock::now();

  // Kernel 1: matrix[r][c] = w^(r*c) * src[c].
  q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read_write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit}; // noinit: prior contents unused
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        acc_b_src{b_src, h};
    h.parallel_for(
        nd_range<2>{range<2>{N, N}, range<2>{B, 1}}, [=](nd_item<2> it) {
          sycl::ONEAPI::sub_group sg = it.get_sub_group();
          const uint r = it.get_global_id(0);
          const uint c = it.get_global_id(1);
          cmplx w = cmplx(std::cos(THETA), std::sin(THETA));
          acc_b_matrix[r][c] = std::pow(w, r * c);
          // c is uniform across the work-group (local range is {B, 1}), so a
          // single load of src[c] is broadcast to the whole sub-group.
          acc_b_matrix[r][c] *= sycl::ONEAPI::broadcast(sg, acc_b_src[c]);
        });
  });

  // Kernel 2: each work-item sums one row into one output element. The
  // accessor dependency on b_matrix orders this after kernel 1.
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read, access::target::global_buffer>
        acc_b_matrix{b_matrix, h};
    accessor<cmplx, 1, access::mode::read_write, access::target::global_buffer>
        acc_b_dst{b_dst, h};
    h.parallel_for(nd_range<1>{range<1>{N}, range<1>{B}}, [=](nd_item<1> it) {
      const size_t r = it.get_global_id(0);
      cmplx sum = cmplx{0., 0.};
      for (uint i = 0; i < N; i++) {
        sum += acc_b_matrix[r][i];
      }
      acc_b_dst[r] = sum;
    });
  });
  evt.wait();

  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
void fill(double *const data) { | |
for (uint i = 0; i < N; i++) { | |
*(data + i) = i + 1; | |
} | |
} | |
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
double *src_vec = (double *)malloc(sizeof(double) * N); | |
cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N); | |
fill(src_vec); | |
memset(dst_vec, 0, sizeof(cmplx) * N); | |
int64_t ts = compute_dft_v0(q, src_vec, dst_vec); | |
std::cout << "dft v0 ✅, in " << ts << " ms" << std::endl; | |
memset(dst_vec, 0, sizeof(cmplx) * N); | |
ts = compute_dft_v0(q, src_vec, dst_vec); | |
std::cout << "dft v1 ✅, in " << ts << " ms" << std::endl; | |
free(src_vec); | |
free(dst_vec); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment