Last active
November 17, 2021 14:34
-
-
Save itzmeanjan/13b5efdff14f9c4877496947f1d9e449 to your computer and use it in GitHub Desktop.
⭐️ Parallel DFT on GPGPU, using SYCL DPC++ 🔥
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <CL/sycl.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
using namespace sycl;

// Complex sample type used throughout the DFT pipeline.
using cmplx = std::complex<double>;

// N: number of DFT points (and DFT-matrix dimension); B: work-group tile size.
constexpr uint N = 1 << 10;
constexpr uint B = 1 << 5;

constexpr double PI = 3.141592653589793;
// Twiddle angle step: e^(i*THETA) is the primitive N-th root of unity used by
// the forward transform (negative sign convention).
constexpr double THETA = (-2.0 * PI) / (double)N;
// Fills `matrix` (row-major N x N) with the DFT matrix:
// matrix[r][c] = e^(i * THETA * r * c).
//
// Each work-item computes its element directly from the angle instead of the
// original std::pow(w, r*c): complex pow is implemented via exp/log, which
// accumulates rounding error for exponents as large as (N-1)^2 and leans on
// std::complex transcendental support inside device code. cos/sin of the
// exact angle is cheaper and numerically tighter.
//
// Returns the kernel event; write-back to `matrix` is completed by the buffer
// destructor before this function returns.
sycl::event compute_dft_matrix(queue &q, cmplx *const matrix) {
  buffer<cmplx, 2> b_matrix{matrix, range<2>{N, N}};
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit}; // noinit: every element overwritten
    h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{1, B}},
                   [=](nd_item<2> it) {
                     const uint r = it.get_global_id(0);
                     const uint c = it.get_global_id(1);
                     // r*c <= (N-1)^2 ~ 2^20: no uint overflow for N = 1024.
                     const double ang = THETA * (double)(r * c);
                     acc_b_matrix[r][c] = cmplx(std::cos(ang), std::sin(ang));
                   });
  });
  return evt;
}
// Computes dst_vec = matrix * src_vec (N x N complex matrix, real vector).
// Each work-group of size B stages a tile of the source vector in local
// memory, so every src_vec element is fetched from global memory once per
// work-group rather than once per work-item.
sycl::event perform_multiplication(queue &q, const double *src_vec,
                                   const cmplx *matrix, cmplx *const dst_vec) {
  buffer<double, 1> b_src_vec{src_vec, range<1>{N}};
  buffer<cmplx, 1> b_matrix(matrix, range<1>{N * N});
  buffer<cmplx, 1> b_dst_vec{dst_vec, range<1>{N}};

  auto evt = q.submit([&](handler &h) {
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        in_vec{b_src_vec, h};
    accessor<cmplx, 1, access::mode::read, access::target::global_buffer>
        in_mat(b_matrix, h);
    accessor<cmplx, 1, access::mode::write, access::target::global_buffer>
        out_vec{b_dst_vec, h, noinit};
    // One B-element tile of the source vector, shared by the work-group.
    accessor<cmplx, 1, access::mode::read_write, access::target::local> tile(
        range<1>{B}, h);

    h.parallel_for(nd_range<1>{{N}, {B}}, [=](nd_item<1> it) {
      const size_t row = it.get_global_id(0);
      const size_t lid = it.get_local_id(0);

      cmplx acc = cmplx(0., 0.);
      for (size_t base = 0; base < N; base += B) {
        // Cooperatively stage the next B source elements into local memory.
        tile[lid] = in_vec[base + lid];
        it.barrier(); // tile fully populated before anyone reads it
        for (uint j = 0; j < B; j++) {
          acc += tile[j] * in_mat[row * N + (base + j)];
        }
        it.barrier(); // all reads finished before the tile is overwritten
      }
      out_vec[row] = acc;
    });
  });
  return evt;
}
// DFT v0: materialize the full DFT matrix on the device, then multiply it
// with the source vector. Returns elapsed wall time in milliseconds.
//
// Fixes over the original:
//  - std::chrono::_V2 is a libstdc++ implementation-detail namespace; the
//    portable spelling is std::chrono::steady_clock.
//  - the N*N scratch matrix is owned by a std::vector (RAII) instead of
//    malloc/memset/free; value-initialization already zeroes the elements.
int64_t compute_dft_v0(queue &q, const double *src_vec, cmplx *const dst_vec) {
  std::vector<cmplx> matrix(N * N); // zero-initialized scratch

  const auto start = std::chrono::steady_clock::now();

  auto evt = compute_dft_matrix(q, matrix.data());
  evt.wait();
  evt = perform_multiplication(q, src_vec, matrix.data(), dst_vec);
  evt.wait();

  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
// DFT v1: two fused kernels — build the DFT matrix with the source vector
// already multiplied into each column, then reduce each row to one output
// element. Returns elapsed wall time in milliseconds.
//
// Fixes over the original:
//  - use-after-free: the original called free(matrix) while the SYCL buffers
//    below (whose host pointer is `matrix`) were still alive, so the buffer
//    write-back on destruction wrote into freed memory. Declaring the vector
//    *before* the buffers makes destruction order (buffers first, storage
//    last) correct automatically.
//  - std::chrono::_V2 internal namespace replaced with the portable
//    std::chrono::steady_clock spelling.
int64_t compute_dft_v1(queue &q, const double *src_vec, cmplx *const dst_vec) {
  std::vector<cmplx> matrix(N * N); // zero-initialized; outlives the buffers

  buffer<cmplx, 2> b_matrix{matrix.data(), range<2>{N, N}};
  buffer<double, 1> b_src{src_vec, range<1>{N}};
  buffer<cmplx, 1> b_dst{dst_vec, range<1>{N}};

  const auto start = std::chrono::steady_clock::now();

  // Kernel 1: matrix[r][c] = w^(r*c) * src[c].
  q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read_write, access::target::global_buffer>
        acc_b_matrix{b_matrix, h, noinit}; // noinit: prior contents unused
    accessor<double, 1, access::mode::read, access::target::global_buffer>
        acc_b_src{b_src, h};
    h.parallel_for(
        nd_range<2>{range<2>{N, N}, range<2>{B, 1}}, [=](nd_item<2> it) {
          sycl::ONEAPI::sub_group sg = it.get_sub_group();
          const uint r = it.get_global_id(0);
          const uint c = it.get_global_id(1);
          cmplx w = cmplx(std::cos(THETA), std::sin(THETA));
          acc_b_matrix[r][c] = std::pow(w, r * c);
          // c is uniform across the work-group (local range is {B, 1}), so a
          // single load of src[c] is broadcast to the whole sub-group.
          acc_b_matrix[r][c] *= sycl::ONEAPI::broadcast(sg, acc_b_src[c]);
        });
  });

  // Kernel 2: each work-item sums one row into one output element. The
  // accessor dependency on b_matrix orders this after kernel 1.
  auto evt = q.submit([&](handler &h) {
    accessor<cmplx, 2, access::mode::read, access::target::global_buffer>
        acc_b_matrix{b_matrix, h};
    accessor<cmplx, 1, access::mode::read_write, access::target::global_buffer>
        acc_b_dst{b_dst, h};
    h.parallel_for(nd_range<1>{range<1>{N}, range<1>{B}}, [=](nd_item<1> it) {
      const size_t r = it.get_global_id(0);
      cmplx sum = cmplx{0., 0.};
      for (uint i = 0; i < N; i++) {
        sum += acc_b_matrix[r][i];
      }
      acc_b_dst[r] = sum;
    });
  });
  evt.wait();

  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}
void fill(double *const data) { | |
for (uint i = 0; i < N; i++) { | |
*(data + i) = i + 1; | |
} | |
} | |
int main(int argc, char **argv) { | |
device d{default_selector{}}; | |
queue q{d}; | |
std::cout << "running on " << d.get_info<info::device::name>() << std::endl; | |
double *src_vec = (double *)malloc(sizeof(double) * N); | |
cmplx *dst_vec = (cmplx *)malloc(sizeof(cmplx) * N); | |
fill(src_vec); | |
memset(dst_vec, 0, sizeof(cmplx) * N); | |
int64_t ts = compute_dft_v0(q, src_vec, dst_vec); | |
std::cout << "dft v0 ✅, in " << ts << " ms" << std::endl; | |
memset(dst_vec, 0, sizeof(cmplx) * N); | |
ts = compute_dft_v0(q, src_vec, dst_vec); | |
std::cout << "dft v1 ✅, in " << ts << " ms" << std::endl; | |
free(src_vec); | |
free(dst_vec); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment