Skip to content

Instantly share code, notes, and snippets.

@itzmeanjan
Last active November 17, 2021 14:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itzmeanjan/13b5efdff14f9c4877496947f1d9e449 to your computer and use it in GitHub Desktop.
⭐️ Parallel DFT on GPGPU, using SYCL DPC++ 🔥
#include <CL/sycl.hpp>
#include <chrono>
#include <complex>
#include <iostream>
#include <vector>
using namespace sycl;
// Problem-size and twiddle-factor constants shared by every kernel below.
using cmplx = std::complex<double>; // complex sample type used on host & device
constexpr uint N = 1 << 10;         // DFT length: 1024 points
constexpr uint B = 1 << 5;          // work-group / tile width: 32 work-items
constexpr double PI = 3.141592653589793;
// Base angle of the principal N-th root of unity, w = e^(i*THETA);
// negative sign gives the forward-DFT convention.
constexpr double THETA = (-2.0 * PI) / (double)N;
// Fills the N x N DFT matrix on the device: matrix[r][c] = w^(r*c) with
// w = e^(i*THETA). The host pointer `matrix` is wrapped in a buffer whose
// destructor (at function exit) blocks until the kernel finishes and copies
// the result back, so `matrix` is fully populated when this returns; the
// returned event is additionally available for explicit waits.
sycl::event compute_dft_matrix(queue &q, cmplx *const matrix) {
  buffer<cmplx, 2> b_matrix{matrix, range<2>{N, N}};
  auto evt = q.submit([&](handler &h) {
    // noinit: kernel overwrites every element, no host->device copy needed.
    accessor<cmplx, 2, access::mode::write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit};
    h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{1, B}},
                   [=](nd_item<2> it) {
                     const uint r = it.get_global_id(0);
                     const uint c = it.get_global_id(1);
                     // Compute w^(r*c) directly from its angle instead of
                     // std::pow(w, r*c): pow on a complex base performs a
                     // log/exp round-trip that is both slower and less
                     // accurate than one cos/sin pair.
                     const double angle = THETA * (double)(r * c);
                     acc_b_matrix[r][c] =
                         cmplx(std::cos(angle), std::sin(angle));
                   });
  });
  return evt;
}
// Computes dst_vec = matrix * src_vec (N x N complex matrix times real
// N-vector), one work-item per output row. The source vector is streamed
// through work-group local memory in B-element tiles so each element of
// src_vec is read from global memory once per work-group instead of once
// per work-item. Write-back of dst_vec to the host happens when b_dst_vec
// is destroyed at function exit.
sycl::event perform_multiplication(queue &q, const double *src_vec,
const cmplx *matrix, cmplx *const dst_vec) {
buffer<double, 1> b_src_vec{src_vec, range<1>{N}};
buffer<cmplx, 1> b_matrix(matrix, range<1>{N * N});
buffer<cmplx, 1> b_dst_vec{dst_vec, range<1>{N}};
auto evt = q.submit([&](handler &h) {
accessor<double, 1, access::mode::read, access::target::global_buffer>
acc_b_src_vec{b_src_vec, h};
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_b_matrix(b_matrix, h);
// noinit: every element of dst is overwritten, skip host->device copy.
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_b_dst_vec{b_dst_vec, h, noinit};
// Work-group scratch tile holding B consecutive elements of src_vec.
accessor<cmplx, 1, access::mode::read_write, access::target::local>
acc_local(range<1>{B}, h);
h.parallel_for(nd_range<1>{{N}, {B}}, [=](nd_item<1> it) {
const size_t r = it.get_global_id(0); // output row handled by this item
const size_t l = it.get_local_id(0);  // lane within the work-group
cmplx sum = cmplx(0., 0.);
// Walk the row in B-wide tiles; N % B == 0, so the loop is uniform
// across the work-group and the barriers below are safe.
for (size_t k = 0; k < N; k += B) {
// Cooperative load: each lane fetches one element of the tile.
acc_local[l] = acc_b_src_vec[k + l];
// Ensure the whole tile is loaded before any lane consumes it.
it.barrier();
for (uint k_ = 0; k_ < B; k_++) {
sum += acc_local[k_] * acc_b_matrix[r * N + (k + k_)];
}
// Ensure all lanes are done with this tile before it is overwritten.
it.barrier();
}
acc_b_dst_vec[r] = sum;
});
});
return evt;
}
// DFT "v0": build the full DFT matrix on the device, then multiply it with
// src_vec in a second kernel. Returns the elapsed wall-clock time in
// milliseconds (kernel submission through completion of both steps).
// dst_vec receives the N complex DFT coefficients.
int64_t compute_dft_v0(queue &q, const double *src_vec, cmplx *const dst_vec) {
  // Zero-initialized RAII storage for the N x N matrix; replaces the
  // original malloc/memset/free triple (and the nonstandard
  // std::chrono::_V2 internal namespace below).
  std::vector<cmplx> matrix(N * N);
  const auto start = std::chrono::steady_clock::now();
  compute_dft_matrix(q, matrix.data()).wait();
  perform_multiplication(q, src_vec, matrix.data(), dst_vec).wait();
  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// DFT "v1": fuse matrix generation with the element-wise scaling by
// src_vec (broadcast through a sub-group), then reduce each row in a
// second kernel. Returns elapsed wall-clock milliseconds; dst_vec receives
// the N complex DFT coefficients.
int64_t compute_dft_v1(queue &q, const double *src_vec, cmplx *const dst_vec) {
  // IMPORTANT: the matrix storage is declared BEFORE the buffers so it
  // strictly outlives them. The original code free()d this allocation
  // before the buffer destructors performed their device->host write-back,
  // which wrote into freed memory (use-after-free).
  std::vector<cmplx> matrix(N * N);
  buffer<cmplx, 2> b_matrix{matrix.data(), range<2>{N, N}};
  buffer<double, 1> b_src{src_vec, range<1>{N}};
  buffer<cmplx, 1> b_dst{dst_vec, range<1>{N}};
  const auto start = std::chrono::steady_clock::now();
  // Kernel 1: matrix[r][c] = w^(r*c) * src_vec[c]. src_vec[c] is fetched
  // once and broadcast across the sub-group (all items in a sub-group
  // share the same column here because the work-group shape is {B, 1}).
  q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read_write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit};
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        acc_b_src{b_src, h};
    h.parallel_for(
        nd_range<2>{range<2>{N, N}, range<2>{B, 1}}, [=](nd_item<2> it) {
          sycl::ONEAPI::sub_group sg = it.get_sub_group();
          const uint r = it.get_global_id(0);
          const uint c = it.get_global_id(1);
          cmplx w = cmplx(std::cos(THETA), std::sin(THETA));
          acc_b_matrix[r][c] = std::pow(w, r * c);
          acc_b_matrix[r][c] *= sycl::ONEAPI::broadcast(sg, acc_b_src[c]);
        });
  });
  // Kernel 2: row-wise reduction; the runtime orders it after kernel 1 via
  // the accessor dependency on b_matrix.
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read, access::target::global_buffer>
        acc_b_matrix{b_matrix, h};
    accessor<cmplx, 1, access::mode::read_write, access::target::global_buffer>
        acc_b_dst{b_dst, h};
    h.parallel_for(nd_range<1>{range<1>{N}, range<1>{B}}, [=](nd_item<1> it) {
      const size_t r = it.get_global_id(0);
      cmplx sum = cmplx{0., 0.};
      for (uint i = 0; i < N; i++) {
        sum += acc_b_matrix[r][i];
      }
      acc_b_dst[r] = sum;
    });
  });
  evt.wait();
  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// Populates data[0..N-1] with the ramp 1.0, 2.0, ..., N.
void fill(double *const data) {
  double next = 1.0;
  for (double *p = data; p != data + N; ++p) {
    *p = next;
    next += 1.0;
  }
}
int main(int argc, char **argv) {
device d{default_selector{}};
queue q{d};
std::cout << "running on " << d.get_info<info::device::name>() << std::endl;
double *src_vec = (double *)malloc(sizeof(double) * N);
cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N);
fill(src_vec);
memset(dst_vec, 0, sizeof(cmplx) * N);
int64_t ts = compute_dft_v0(q, src_vec, dst_vec);
std::cout << "dft v0 ✅, in " << ts << " ms" << std::endl;
memset(dst_vec, 0, sizeof(cmplx) * N);
ts = compute_dft_v0(q, src_vec, dst_vec);
std::cout << "dft v1 ✅, in " << ts << " ms" << std::endl;
free(src_vec);
free(dst_vec);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment