1D parallel FFT and inverse FFT implemented from scratch in SYCL DPC++.
#include <CL/sycl.hpp>

#include <cassert>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
// NOTE(review): file-scope using-directive kept — the rest of the file relies
// on unqualified SYCL names (queue, buffer, accessor, ...).
using namespace sycl;

// Complex sample type used throughout the transform.
using cmplx = std::complex<double>;

// Transform length (must be a power of two) and the SYCL work-group size.
constexpr uint N = 1024;
constexpr uint B = 32;

constexpr double PI = 3.141592653589793;
// Base twiddle angles: e^{i*THETA} generates the N-th roots of unity
// (negative sign for the forward transform, positive for the inverse).
constexpr double THETA_FFT = (-2.0 * PI) / (double)N;
constexpr double THETA_INV_FFT = (2.0 * PI) / (double)N;
// Returns the bit-reversal of the lowest log2(N) bits of `num`, encoded as a
// base-10 number whose decimal digits are the reversed bits; e.g. for
// N = 1024, binary_rev(1) == 1000000000.  decimal() converts the encoding
// back to an ordinary integer.
// NOTE(review): the digit encoding only fits in uint32_t while
// log2(N) <= 10; a direct bit-reversal would avoid the detour entirely.
uint32_t binary_rev(uint num) {
  // Hoisted: the loop bound is invariant, and the original compared an int
  // counter against a freshly computed double on every iteration.
  const int bits = (int)std::log2((double)N);
  uint32_t repr = 0;
  for (int i = 0; i < bits; i++) {
    repr = repr * 10 + ((num >> i) & 1);
  }
  return repr;
}
// Inverse of binary_rev(): interprets the decimal digits of `bin` as bits
// (least-significant digit first) and returns the ordinary integer value.
uint decimal(uint32_t bin) {
  uint dec = 0;
  uint pos = 0;
  while (bin > 0) {
    // Each base-10 digit is a 0/1 bit; weight it with an integer shift
    // instead of the floating-point std::pow(2, pos) the original used.
    dec += (1u << pos) * (bin % 10);
    pos++;
    bin /= 10;
  }
  return dec;
}
// Computes the forward FFT of the N real samples in `src` into `dst`.
// Returns the elapsed wall-clock time in milliseconds.
//
// Runs log2(N) butterfly passes on the device, ping-ponging between `b_dst`
// and a scratch buffer, then applies the bit-reversal permutation in a final
// kernel.  `dst` receives the result when the buffers write back at scope
// exit.
int64_t fft(queue &q, const double *src, cmplx *const dst) {
  // Scratch storage for the ping-pong passes.  Declared BEFORE the SYCL
  // buffers so it is destroyed after them: the buffer destructor writes the
  // device data back into this memory.  (The original malloc'd this and
  // free()d it before the buffers went out of scope — a use-after-free on
  // write-back.)
  std::vector<cmplx> tmp_dst(N, cmplx(0., 0.));
  buffer<double, 1> b_src{src, range<1>{N}};
  buffer<cmplx, 1> b_tmp_dst{tmp_dst.data(), range<1>{N}};
  buffer<cmplx, 1> b_dst{dst, range<1>{N}};
  // Use the public clock name: std::chrono::_V2 is a libstdc++ internal
  // namespace and not portable.
  const std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  // Pass 1: widen the real input samples into complex values.
  q.submit([&](handler &h) {
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        acc_src{b_src, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};
    h.parallel_for<class stepOneFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const size_t idx = it.get_global_id(0);
      double real = acc_src[idx];
      acc_dst[idx] = cmplx(real, 0.);
      acc_tmp_dst[idx] = cmplx(real, 0.);
    });
  });
  // Pass 2: log2(N) butterfly stages.  Each stage reads b_dst, writes
  // b_tmp_dst, then copies the result back for the next stage; ordering is
  // enforced by the accessor dependencies.
  for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};
      h.parallel_for<class stepTwoFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
        const uint k = it.get_global_id(0);
        const uint p = (uint)std::pow(2, i);
        // Renamed from `q` (original shadowed the queue parameter).
        const uint seg = N / p;
        const cmplx w = cmplx(std::cos(THETA_FFT), std::sin(THETA_FFT));
        const cmplx z = std::pow(w, p);
        // The "upper" element of each butterfly pair computes both outputs.
        if (k % p == k % (2 * p)) {
          uint pow_z = decimal(binary_rev(k)) % seg;
          // Hoisted common subexpression (computed twice in the original).
          const cmplx t = acc_dst[k + p] * std::pow(z, pow_z);
          acc_tmp_dst[k] = acc_dst[k] + t;
          acc_tmp_dst[k + p] = acc_dst[k] - t;
        }
      });
    });
    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};
      h.copy(acc_tmp_dst, acc_dst);
    });
  }
  // Pass 3: undo the bit-reversed ordering of the output.
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};
    h.parallel_for<class stepThreeFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const uint k = it.get_global_id(0);
      acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))];
    });
  });
  evt.wait();
  const std::chrono::steady_clock::time_point end =
      std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// Computes the inverse FFT of the N complex samples in `src` into `dst`,
// including the final 1/N normalization.  Returns the elapsed wall-clock
// time in milliseconds.  Structure mirrors fft(), with the conjugate
// twiddle angle THETA_INV_FFT.
int64_t inv_fft(queue &q, const cmplx *src, cmplx *const dst) {
  // Scratch storage; declared BEFORE the buffers so the buffer write-back at
  // scope exit targets live memory.  (The original free()d a malloc'd block
  // before the buffer destructors ran — a use-after-free.)
  std::vector<cmplx> tmp_dst(N, cmplx(0., 0.));
  buffer<cmplx, 1> b_src{src, range<1>{N}};
  buffer<cmplx, 1> b_tmp_dst{tmp_dst.data(), range<1>{N}};
  buffer<cmplx, 1> b_dst{dst, range<1>{N}};
  // Public clock name instead of the libstdc++-internal std::chrono::_V2.
  const std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  // Pass 1: seed both working buffers with the input.
  q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_src{b_src, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};
    h.parallel_for<class stepOneInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const size_t idx = it.get_global_id(0);
      acc_dst[idx] = acc_src[idx];
      acc_tmp_dst[idx] = acc_src[idx];
    });
  });
  // Pass 2: log2(N) butterfly stages (same scheme as fft()).
  for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};
      h.parallel_for<class stepTwoInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
        const uint k = it.get_global_id(0);
        const uint p = (uint)std::pow(2, i);
        // Renamed from `q` (original shadowed the queue parameter).
        const uint seg = N / p;
        const cmplx w = cmplx(std::cos(THETA_INV_FFT), std::sin(THETA_INV_FFT));
        const cmplx z = std::pow(w, p);
        if (k % p == k % (2 * p)) {
          uint pow_z = decimal(binary_rev(k)) % seg;
          // Hoisted common subexpression (computed twice in the original).
          const cmplx t = acc_dst[k + p] * std::pow(z, pow_z);
          acc_tmp_dst[k] = acc_dst[k] + t;
          acc_tmp_dst[k + p] = acc_dst[k] - t;
        }
      });
    });
    // The original waited on each per-stage copy; kept for identical
    // scheduling behavior.
    auto evt = q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};
      h.copy(acc_tmp_dst, acc_dst);
    });
    evt.wait();
  }
  // Pass 3: undo the bit-reversed ordering and apply the 1/N scale.
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};
    h.parallel_for<class stepThreeInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const uint k = it.get_global_id(0);
      acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))] / (double)N;
    });
  });
  evt.wait();
  const std::chrono::steady_clock::time_point end =
      std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// Populates the first N slots of `data` with the ramp 0, 1, ..., N-1.
void fill(double *const data) {
  for (uint idx = 0; idx < N; ++idx) {
    data[idx] = static_cast<double>(idx);
  }
}
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
double *src_vec = (double *)malloc(sizeof(double) * N); | |
cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N); | |
cmplx *inv_dst_vec = (cmplx *)malloc(sizeof(cmplx) * N); | |
fill(src_vec); | |
memset(dst_vec, 0, sizeof(cmplx) * N); | |
memset(inv_dst_vec, 0, sizeof(cmplx) * N); | |
int64_t ts = fft(q, src_vec, dst_vec); | |
std::cout << "fft β , in " << ts << " ms" << std::endl; | |
ts = inv_fft(q, dst_vec, inv_dst_vec); | |
std::cout << "inverse fft β , in " << ts << " ms" << std::endl; | |
for (uint i = 0; i < N; i++) { | |
assert(src_vec[i] == std::round(inv_dst_vec[i].real())); | |
} | |
std::cout << "asserted !" << std::endl; | |
std::free(src_vec); | |
std::free(dst_vec); | |
std::free(inv_dst_vec); | |
return 0; | |
} |
Introduction
This code implements a 1D parallel FFT and inverse FFT from scratch in SYCL DPC++. It uses O(N) work-items (global work size), where N = length(src_vector). After computing the FFT, it performs the inverse FFT and asserts that the round trip reproduces the input.
Usage
For compiling & running this piece of code, make sure you've built & installed DPC++ toolchain. You may want to follow: https://intel.github.io/llvm-docs/GetStartedGuide.html#create-dpc-workspace
You should now have a `clang++` compiler installed that can compile SYCL programs. Alternatively, you may download the Intel oneAPI Base Toolkit from https://software.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top.html, which provides the `dpcpp` compiler; in that case, compile and run with `dpcpp` instead.