itzmeanjan/fft.cpp

## fft.cpp
#include <CL/sycl.hpp>
#include <chrono>
#include <complex>
#include <iostream>

using namespace sycl;
typedef std::complex<double> cmplx;

constexpr uint N = 1024;
constexpr uint B = 32;
constexpr double PI = 3.141592653589793;
constexpr double THETA_FFT = (-2.0 * PI) / (double)N;
constexpr double THETA_INV_FFT = (2.0 * PI) / (double)N;

uint32_t binary_rev(uint num) {
  uint32_t repr = 0;

  for (int i = 0; i < std::log2(N); i++) {
    repr = repr * 10 + ((num >> i) & 1);
  }

  return repr;
}

uint decimal(uint32_t bin) {
  uint dec = 0;

  uint pos = 0;
  while (bin > 0) {
    dec += (uint)std::pow(2, pos) * (bin % 10);
    pos++;
    bin /= 10;
  }

  return dec;
}

int64_t fft(queue &q, const double *src, cmplx *const dst) {
  cmplx *tmp_dst = (cmplx *)malloc(sizeof(cmplx) * N);
  memset(tmp_dst, 0, sizeof(cmplx) * N);

  buffer<double, 1> b_src{src, range<1>{N}};
  buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
  buffer<cmplx, 1> b_dst{dst, range<1>{N}};

  std::chrono::_V2::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  q.submit([&](handler &h) {
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        acc_src{b_src, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};

    h.parallel_for<class stepOneFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const size_t idx = it.get_global_id(0);
      double real = acc_src[idx];

      acc_dst[idx] = cmplx(real, 0.);
      acc_tmp_dst[idx] = cmplx(real, 0.);
    });
  });

  for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};

      h.parallel_for<class stepTwoFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
        const uint k = it.get_global_id(0);
        const uint p = (uint)std::pow(2, i);
        const uint q = N / p;
        const cmplx w = cmplx(std::cos(THETA_FFT), std::sin(THETA_FFT));
        const cmplx z = std::pow(w, p);

        if (k % p == k % (2 * p)) {
          uint pow_z = decimal(binary_rev(k)) % q;

          acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
          acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
        }
      });
    });

    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};

      h.copy(acc_tmp_dst, acc_dst);
    });
  }

  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};

    h.parallel_for<class stepThreeFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const uint k = it.get_global_id(0);
      acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))];
    });
  });
  evt.wait();

  std::chrono::_V2::steady_clock::time_point end =
      std::chrono::steady_clock::now();

  std::free(tmp_dst);
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}

int64_t inv_fft(queue &q, const cmplx *src, cmplx *const dst) {
  cmplx *tmp_dst = (cmplx *)malloc(sizeof(cmplx) * N);
  memset(tmp_dst, 0, sizeof(cmplx) * N);

  buffer<cmplx, 1> b_src{src, range<1>{N}};
  buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
  buffer<cmplx, 1> b_dst{dst, range<1>{N}};

  std::chrono::_V2::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_src{b_src, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};

    h.parallel_for<class stepOneInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const size_t idx = it.get_global_id(0);

      acc_dst[idx] = acc_src[idx];
      acc_tmp_dst[idx] = acc_src[idx];
    });
  });

  for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
    q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};

      h.parallel_for<class stepTwoInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
        const uint k = it.get_global_id(0);
        const uint p = (uint)std::pow(2, i);
        const uint q = N / p;
        const cmplx w = cmplx(std::cos(THETA_INV_FFT), std::sin(THETA_INV_FFT));
        const cmplx z = std::pow(w, p);

        if (k % p == k % (2 * p)) {
          uint pow_z = decimal(binary_rev(k)) % q;

          acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
          acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
        }
      });
    });

    auto evt = q.submit([&](handler &h) {
      accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
          acc_dst{b_dst, h};
      accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
          acc_tmp_dst{b_tmp_dst, h};

      h.copy(acc_tmp_dst, acc_dst);
    });
    evt.wait();
  }

  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        acc_dst{b_dst, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        acc_tmp_dst{b_tmp_dst, h};

    h.parallel_for<class stepThreeInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
      const uint k = it.get_global_id(0);
      acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))] / (double)N;
    });
  });
  evt.wait();

  std::chrono::_V2::steady_clock::time_point end =
      std::chrono::steady_clock::now();

  std::free(tmp_dst);
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}

void fill(double *const data) {
  for (uint i = 0; i < N; i++) {
    *(data + i) = i;
  }
}

int main(int argc, char **argv) {
  device d{default_selector{}};
  queue q{d};

  std::cout << "running on " << d.get_info<info::device::name>() << std::endl;

  double *src_vec = (double *)malloc(sizeof(double) * N);
  cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N);
  cmplx *inv_dst_vec = (cmplx *)malloc(sizeof(cmplx) * N);

  fill(src_vec);
  memset(dst_vec, 0, sizeof(cmplx) * N);
  memset(inv_dst_vec, 0, sizeof(cmplx) * N);

  int64_t ts = fft(q, src_vec, dst_vec);
  std::cout << "fft ✅, in " << ts << " ms" << std::endl;

  ts = inv_fft(q, dst_vec, inv_dst_vec);
  std::cout << "inverse fft ✅, in " << ts << " ms" << std::endl;

  for (uint i = 0; i < N; i++) {
    assert(src_vec[i] == std::round(inv_dst_vec[i].real()));
  }

  std::cout << "asserted !" << std::endl;

  std::free(src_vec);
  std::free(dst_vec);
  std::free(inv_dst_vec);
  return 0;
}
	#include <CL/sycl.hpp>
	#include <chrono>
	#include <complex>
	#include <iostream>

	using namespace sycl;
	typedef std::complex<double> cmplx;

	constexpr uint N = 1024;
	constexpr uint B = 32;
	constexpr double PI = 3.141592653589793;
	constexpr double THETA_FFT = (-2.0 * PI) / (double)N;
	constexpr double THETA_INV_FFT = (2.0 * PI) / (double)N;

	uint32_t binary_rev(uint num) {
	uint32_t repr = 0;

	for (int i = 0; i < std::log2(N); i++) {
	repr = repr * 10 + ((num >> i) & 1);
	}

	return repr;
	}

	uint decimal(uint32_t bin) {
	uint dec = 0;

	uint pos = 0;
	while (bin > 0) {
	dec += (uint)std::pow(2, pos) * (bin % 10);
	pos++;
	bin /= 10;
	}

	return dec;
	}

	int64_t fft(queue &q, const double src, cmplx const dst) {
	cmplx tmp_dst = (cmplx )malloc(sizeof(cmplx) * N);
	memset(tmp_dst, 0, sizeof(cmplx) * N);

	buffer<double, 1> b_src{src, range<1>{N}};
	buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
	buffer<cmplx, 1> b_dst{dst, range<1>{N}};

	std::chrono::_V2::steady_clock::time_point start =
	std::chrono::steady_clock::now();

	q.submit([&](handler &h) {
	accessor<double, 1, access::mode::read, access::target::global_buffer>
	acc_src{b_src, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepOneFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const size_t idx = it.get_global_id(0);
	double real = acc_src[idx];

	acc_dst[idx] = cmplx(real, 0.);
	acc_tmp_dst[idx] = cmplx(real, 0.);
	});
	});

	for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
	q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepTwoFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const uint k = it.get_global_id(0);
	const uint p = (uint)std::pow(2, i);
	const uint q = N / p;
	const cmplx w = cmplx(std::cos(THETA_FFT), std::sin(THETA_FFT));
	const cmplx z = std::pow(w, p);

	if (k % p == k % (2 * p)) {
	uint pow_z = decimal(binary_rev(k)) % q;

	acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
	acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
	}
	});
	});

	q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.copy(acc_tmp_dst, acc_dst);
	});
	}

	auto evt = q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepThreeFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const uint k = it.get_global_id(0);
	acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))];
	});
	});
	evt.wait();

	std::chrono::_V2::steady_clock::time_point end =
	std::chrono::steady_clock::now();

	std::free(tmp_dst);
	return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
	.count();
	}

	int64_t inv_fft(queue &q, const cmplx src, cmplx const dst) {
	cmplx tmp_dst = (cmplx )malloc(sizeof(cmplx) * N);
	memset(tmp_dst, 0, sizeof(cmplx) * N);

	buffer<cmplx, 1> b_src{src, range<1>{N}};
	buffer<cmplx, 1> b_tmp_dst{tmp_dst, range<1>{N}};
	buffer<cmplx, 1> b_dst{dst, range<1>{N}};

	std::chrono::_V2::steady_clock::time_point start =
	std::chrono::steady_clock::now();

	q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_src{b_src, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepOneInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const size_t idx = it.get_global_id(0);

	acc_dst[idx] = acc_src[idx];
	acc_tmp_dst[idx] = acc_src[idx];
	});
	});

	for (int i = (int)std::log2(N) - 1; i >= 0; i--) {
	q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepTwoInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const uint k = it.get_global_id(0);
	const uint p = (uint)std::pow(2, i);
	const uint q = N / p;
	const cmplx w = cmplx(std::cos(THETA_INV_FFT), std::sin(THETA_INV_FFT));
	const cmplx z = std::pow(w, p);

	if (k % p == k % (2 * p)) {
	uint pow_z = decimal(binary_rev(k)) % q;

	acc_tmp_dst[k] = acc_dst[k] + acc_dst[k + p] * std::pow(z, pow_z);
	acc_tmp_dst[k + p] = acc_dst[k] - acc_dst[k + p] * std::pow(z, pow_z);
	}
	});
	});

	auto evt = q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.copy(acc_tmp_dst, acc_dst);
	});
	evt.wait();
	}

	auto evt = q.submit([&](handler &h) {
	accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
	acc_dst{b_dst, h};
	accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
	acc_tmp_dst{b_tmp_dst, h};

	h.parallel_for<class stepThreeInvFFT>(nd_range<1>{N, B}, [=](nd_item<1> it) {
	const uint k = it.get_global_id(0);
	acc_dst[k] = acc_tmp_dst[decimal(binary_rev(k))] / (double)N;
	});
	});
	evt.wait();

	std::chrono::_V2::steady_clock::time_point end =
	std::chrono::steady_clock::now();

	std::free(tmp_dst);
	return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
	.count();
	}

	void fill(double *const data) {
	for (uint i = 0; i < N; i++) {
	*(data + i) = i;
	}
	}

	int main(int argc, char **argv) {
	device d{default_selector{}};
	queue q{d};

	std::cout << "running on " << d.get_info<info::device::name>() << std::endl;

	double src_vec = (double )malloc(sizeof(double) * N);
	cmplx dst_vec = (cmplx )malloc(sizeof(cmplx) * N);
	cmplx inv_dst_vec = (cmplx )malloc(sizeof(cmplx) * N);

	fill(src_vec);
	memset(dst_vec, 0, sizeof(cmplx) * N);
	memset(inv_dst_vec, 0, sizeof(cmplx) * N);

	int64_t ts = fft(q, src_vec, dst_vec);
	std::cout << "fft ✅, in " << ts << " ms" << std::endl;

	ts = inv_fft(q, dst_vec, inv_dst_vec);
	std::cout << "inverse fft ✅, in " << ts << " ms" << std::endl;

	for (uint i = 0; i < N; i++) {
	assert(src_vec[i] == std::round(inv_dst_vec[i].real()));
	}

	std::cout << "asserted !" << std::endl;

	std::free(src_vec);
	std::free(dst_vec);
	std::free(inv_dst_vec);
	return 0;
	}