Skip to content

Instantly share code, notes, and snippets.

@itzmeanjan
Last active Oct 1, 2021
Embed
What would you like to do?
🀞 Parallel (inv)FFT on GPGPU, using SYCL DPC++ πŸš€
#include <CL/sycl.hpp>
#include <chrono>
#include <complex>
#include <iostream>
using namespace sycl;
typedef std::complex<double> cmplx;
constexpr uint N = 1024;
constexpr uint B = 32;
constexpr double PI = 3.141592653589793;
constexpr double THETA_FFT = (-2.0 * PI) / (double)N;
constexpr double THETA_INV_FFT = (2.0 * PI) / (double)N;
uint32_t binary_rev(uint num) {
uint32_t repr = 0;
for (int i = 0; i < std::log2(N); i++) {
repr = repr * 10 + ((num >> i) & 1);
}
return repr;
}
uint decimal(uint32_t bin) {
uint dec = 0;
uint pos = 0;
while (bin > 0) {
dec += (uint)std::pow(2, pos) * (bin % 10);
pos++;
bin /= 10;
}
return dec;
}
int64_t fft(queue &q, const double *src, cmplx *const dst) {
cmplx *tmp_dst = (cmplx *)malloc(sizeof(cmplx) * N);
memset(tmp_dst, 0, sizeof(cmplx) * N);
buffer<double, 1> b_src{src, range<1>{N}};
buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
buffer<cmplx, 1> b_dst{dst, range<1>{N}};
std::chrono::_V2::steady_clock::time_point start =
std::chrono::steady_clock::now();
q.submit([&](handler &h) {
accessor<double, 1, access::mode::read, access::target::global_buffer>
acc_src{b_src, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepOneFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const size_t idx = it.get_global_id(0);
double real = acc_src[idx];
acc_dst[idx] = cmplx(real, 0.);
acc_tmp_dst[idx] = cmplx(real, 0.);
});
});
for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepTwoFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const uint k = it.get_global_id(0);
const uint p = (uint)std::pow(2, i);
const uint q = N / p;
const cmplx w = cmplx(std::cos(THETA_FFT), std::sin(THETA_FFT));
const cmplx z = std::pow(w, p);
if (k % p == k % (2 * p)) {
uint pow_z = decimal(binary_rev(k)) % q;
acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
}
});
});
q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.copy(acc_tmp_dst, acc_dst);
});
}
auto evt = q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepThreeFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const uint k = it.get_global_id(0);
acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))];
});
});
evt.wait();
std::chrono::_V2::steady_clock::time_point end =
std::chrono::steady_clock::now();
std::free(tmp_dst);
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
}
int64_t inv_fft(queue &q, const cmplx *src, cmplx *const dst) {
cmplx *tmp_dst = (cmplx *)malloc(sizeof(cmplx) * N);
memset(tmp_dst, 0, sizeof(cmplx) * N);
buffer<cmplx, 1> b_src{src, range<1>{N}};
buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
buffer<cmplx, 1> b_dst{dst, range<1>{N}};
std::chrono::_V2::steady_clock::time_point start =
std::chrono::steady_clock::now();
q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_src{b_src, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepOneInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const size_t idx = it.get_global_id(0);
acc_dst[idx] = acc_src[idx];
acc_tmp_dst[idx] = acc_src[idx];
});
});
for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepTwoInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const uint k = it.get_global_id(0);
const uint p = (uint)std::pow(2, i);
const uint q = N / p;
const cmplx w = cmplx(std::cos(THETA_INV_FFT), std::sin(THETA_INV_FFT));
const cmplx z = std::pow(w, p);
if (k % p == k % (2 * p)) {
uint pow_z = decimal(binary_rev(k)) % q;
acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
}
});
});
auto evt = q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.copy(acc_tmp_dst, acc_dst);
});
evt.wait();
}
auto evt = q.submit([&](handler &h) {
accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
acc_dst{b_dst, h};
accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
acc_tmp_dst{b_tmp_dst, h};
h.parallel_for<class stepThreeInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
const uint k = it.get_global_id(0);
acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))] / (double)N;
});
});
evt.wait();
std::chrono::_V2::steady_clock::time_point end =
std::chrono::steady_clock::now();
std::free(tmp_dst);
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
}
void fill(double *const data) {
for (uint i = 0; i < N; i++) {
*(data + i) = i;
}
}
int main(int argc, char **argv) {
device d{default_selector{}};
queue q{d};
std::cout << "running on " << d.get_info<info::device::name>() << std::endl;
double *src_vec = (double *)malloc(sizeof(double) * N);
cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N);
cmplx *inv_dst_vec = (cmplx *)malloc(sizeof(cmplx) * N);
fill(src_vec);
memset(dst_vec, 0, sizeof(cmplx) * N);
memset(inv_dst_vec, 0, sizeof(cmplx) * N);
int64_t ts = fft(q, src_vec, dst_vec);
std::cout << "fft βœ…, in " << ts << " ms" << std::endl;
ts = inv_fft(q, dst_vec, inv_dst_vec);
std::cout << "inverse fft βœ…, in " << ts << " ms" << std::endl;
for (uint i = 0; i < N; i++) {
assert(src_vec[i] == std::round(inv_dst_vec[i].real()));
}
std::cout << "asserted !" << std::endl;
std::free(src_vec);
std::free(dst_vec);
std::free(inv_dst_vec);
return 0;
}
@itzmeanjan
Copy link
Author

itzmeanjan commented Oct 1, 2021

Introduction

This piece of code implements 1D parallel (inv)FFT from scratch in SYCL DPC++. It requires O(N) processors i.e. global worker size, where N = length(src_vector). After computing FFT, it performs inverse FFT & asserts result obtained.

Usage

For compiling & running this piece of code, make sure you've built & installed DPC++ toolchain. You may want to follow: https://intel.github.io/llvm-docs/GetStartedGuide.html#create-dpc-workspace

Now you should have clang++ compiler installed, which can compile SYCL programs like πŸ‘‡

$ clang++ --version
Intel(R) oneAPI DPC++/C++ Compiler 2021.3.0 (2021.3.0.20210619)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/intel/oneapi/compiler/2021.3.0/linux/bin

$ clang++ -fsycl fft.cpp && ./a.out
running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
fft βœ…, in 1556 ms
inverse fft βœ…, in 9 ms
asserted !

# you may see something similar πŸ‘†

You may also download Intel oneAPI base toolkit, from https://software.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top.html, which should give you dpcpp compiler.

This time you can compile & run with dpcpp

$ dpcpp --version
Intel(R) oneAPI DPC++/C++ Compiler 2021.3.0 (2021.3.0.20210619)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/intel/oneapi/compiler/2021.3.0/linux/bin

$ dpcpp -fsycl fft.cpp && ./a.out
running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
fft βœ…, in 492 ms
inverse fft βœ…, in 7 ms
a.out: 30.cpp:230: int main(int, char **): Assertion `src_vec[i] == std::round(inv_dst_vec[i].real())' failed.
Aborted (core dumped)

# though assertion fails, if B > 4 πŸ™‚
# check https://gist.github.com/itzmeanjan/94cadd86e99cf6fe6dfe4f6439b7ce28#file-fft-cpp-L11

# you may consider setting B <= 4, recompile & run
# see assertion doesn't fail anymore ! πŸ€”

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment