itzmeanjan/lu_decomposition.cpp

## lu_decomposition.cpp
#include <CL/sycl.hpp>
#include <chrono>
#include <iostream>
#include <random>

using namespace sycl;

constexpr uint N = 1 << 10;
constexpr uint B = 1 << 5;

sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) {
  auto evt = q.submit([&](handler &h) {
    accessor<float, 2, access::mode::write, access::target::global_buffer>
        acc_matrix{matrix, h, noinit};

    h.parallel_for<class kernelIdentity>(
        nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
          const size_t r = it.get_global_id(0);
          const size_t c = it.get_global_id(1);

          if (r == c) {
            acc_matrix[r][c] = 1.f;
          } else {
            acc_matrix[r][c] = 0.f;
          }
        });
  });
  return evt;
}

int64_t lu_decomposition(queue &q, const float *matrix, float *const ldiag,
                         float *const udiag) {
  float *matrix_ = (float *)malloc(sizeof(float) * N * N);
  memcpy(matrix_, matrix, sizeof(float) * N * N);

  buffer<float, 2> b_matrix{matrix_, range<2>{N, N}};
  buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}};
  buffer<float, 2> b_udiag{udiag, range<2>{N, N}};

  std::chrono::_V2::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  auto evt = identity_matrix(q, b_ldiag);
  evt.wait();

  for (size_t i = 0; i < N - 1; i++) {
    q.submit([&](handler &h) {
      accessor<float, 2, access::mode::read, access::target::global_buffer>
          acc_matrix{b_matrix, h};
      accessor<float, 2, access::mode::write, access::target::global_buffer>
          acc_ldiag{b_ldiag, h};

      h.parallel_for<class kernelLDiagonal>(
          nd_range<1>{N, B}, [=](nd_item<1> it) {
            const size_t r = it.get_global_id(0);

            if (r > i) {
              acc_ldiag[r][i] = acc_matrix[r][i] / acc_matrix[i][i];
            }
          });
    });

    q.submit([&](handler &h) {
      accessor<float, 2, access::mode::read, access::target::global_buffer>
          acc_matrix{b_matrix, h};
      accessor<float, 2, access::mode::write, access::target::global_buffer>
          acc_udiag{b_udiag, h};

      h.parallel_for<class kernelUDiagonal>(
          nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
            const size_t r = it.get_global_id(0);
            const size_t c = it.get_global_id(1);

            if (r > i && c >= i) {
              acc_udiag[r][c] =
                  acc_matrix[r][c] -
                  (acc_matrix[i][c] * (acc_matrix[r][i] / acc_matrix[i][i]));
            } else {
              acc_udiag[r][c] = acc_matrix[r][c];
            }
          });
    });

    auto evt = q.submit([&](handler &h) {
      accessor<float, 2, access::mode::write, access::target::global_buffer>
          acc_matrix{b_matrix, h};
      accessor<float, 2, access::mode::read, access::target::global_buffer>
          acc_udiag{b_udiag, h};

      h.copy(acc_udiag, acc_matrix);
    });
    evt.wait();
  }

  std::chrono::_V2::steady_clock::time_point end =
      std::chrono::steady_clock::now();

  std::free(matrix_);
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
      .count();
}

// when running with example matrix, consider
// setting N = 4, B= 4
//
// below is a 4x4 matrix
void example_matrix(float *const matrix) {
  matrix[0 * N + 0] = 2.f;
  matrix[0 * N + 1] = 4.f;
  matrix[0 * N + 2] = 3.f;
  matrix[0 * N + 3] = 5.f;

  matrix[1 * N + 0] = -4.f;
  matrix[1 * N + 1] = -7.f;
  matrix[1 * N + 2] = -5.f;
  matrix[1 * N + 3] = -8.f;

  matrix[2 * N + 0] = 6.f;
  matrix[2 * N + 1] = 8.f;
  matrix[2 * N + 2] = 2.f;
  matrix[2 * N + 3] = 9.f;

  matrix[3 * N + 0] = 4.f;
  matrix[3 * N + 1] = 9.f;
  matrix[3 * N + 2] = -2.f;
  matrix[3 * N + 3] = 14.f;
}

void random_matrix(float *const adj_matrix) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(0.f, 1.f);

  for (uint i = 0; i < N; i++) {
    for (uint j = 0; j < N; j++) {
      adj_matrix[i * N + j] = dis(gen);
    }
  }
}

int main(int argc, char **argv) {
  device d{default_selector{}};
  queue q{d};
  std::cout << "running on " << d.get_info<info::device::name>() << std::endl;

  float *matrix = (float *)malloc(sizeof(float) * N * N);
  float *ldiag = (float *)malloc(sizeof(float) * N * N);
  float *udiag = (float *)malloc(sizeof(float) * N * N);

  memset(matrix, 0, sizeof(float) * N * N);
  memset(ldiag, 0, sizeof(float) * N * N);
  memset(udiag, 0, sizeof(float) * N * N);
  random_matrix(matrix);

  int64_t ts = lu_decomposition(q, matrix, ldiag, udiag);
  std::cout << "LU decomposition: " << ts << " ms" << std::endl;

  float max_dev = 0.f;
  for (uint i = 0; i < N; i++) {
    for (uint j = 0; j < N; j++) {
      float sum = 0.f;

      for (uint k = 0; k < N; k++) {
        sum += ldiag[i * N + k] * udiag[k * N + j];
      }

      float dev = std::abs(matrix[i * N + j] - sum);
      max_dev = std::max(dev, max_dev);
    }
  }

  std::cout << "max deviation: " << max_dev << std::endl;

  std::free(matrix);
  std::free(ldiag);
  std::free(udiag);

  return 0;
}
	#include <CL/sycl.hpp>
	#include <chrono>
	#include <iostream>
	#include <random>

	using namespace sycl;

	constexpr uint N = 1 << 10;
	constexpr uint B = 1 << 5;

	sycl::event identity_matrix(queue &q, buffer<float, 2> &matrix) {
	auto evt = q.submit([&](handler &h) {
	accessor<float, 2, access::mode::write, access::target::global_buffer>
	acc_matrix{matrix, h, noinit};

	h.parallel_for<class kernelIdentity>(
	nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
	const size_t r = it.get_global_id(0);
	const size_t c = it.get_global_id(1);

	if (r == c) {
	acc_matrix[r][c] = 1.f;
	} else {
	acc_matrix[r][c] = 0.f;
	}
	});
	});
	return evt;
	}

	int64_t lu_decomposition(queue &q, const float matrix, float const ldiag,
	float *const udiag) {
	float matrix_ = (float )malloc(sizeof(float) * N * N);
	memcpy(matrix_, matrix, sizeof(float) * N * N);

	buffer<float, 2> b_matrix{matrix_, range<2>{N, N}};
	buffer<float, 2> b_ldiag{ldiag, range<2>{N, N}};
	buffer<float, 2> b_udiag{udiag, range<2>{N, N}};

	std::chrono::_V2::steady_clock::time_point start =
	std::chrono::steady_clock::now();

	auto evt = identity_matrix(q, b_ldiag);
	evt.wait();

	for (size_t i = 0; i < N - 1; i++) {
	q.submit([&](handler &h) {
	accessor<float, 2, access::mode::read, access::target::global_buffer>
	acc_matrix{b_matrix, h};
	accessor<float, 2, access::mode::write, access::target::global_buffer>
	acc_ldiag{b_ldiag, h};

	h.parallel_for<class kernelLDiagonal>(
	nd_range<1>{N, B}, [=](nd_item<1> it) {
	const size_t r = it.get_global_id(0);

	if (r > i) {
	acc_ldiag[r][i] = acc_matrix[r][i] / acc_matrix[i][i];
	}
	});
	});

	q.submit([&](handler &h) {
	accessor<float, 2, access::mode::read, access::target::global_buffer>
	acc_matrix{b_matrix, h};
	accessor<float, 2, access::mode::write, access::target::global_buffer>
	acc_udiag{b_udiag, h};

	h.parallel_for<class kernelUDiagonal>(
	nd_range<2>{range<2>{N, N}, range<2>{1, B}}, [=](nd_item<2> it) {
	const size_t r = it.get_global_id(0);
	const size_t c = it.get_global_id(1);

	if (r > i && c >= i) {
	acc_udiag[r][c] =
	acc_matrix[r][c] -
	(acc_matrix[i][c] * (acc_matrix[r][i] / acc_matrix[i][i]));
	} else {
	acc_udiag[r][c] = acc_matrix[r][c];
	}
	});
	});

	auto evt = q.submit([&](handler &h) {
	accessor<float, 2, access::mode::write, access::target::global_buffer>
	acc_matrix{b_matrix, h};
	accessor<float, 2, access::mode::read, access::target::global_buffer>
	acc_udiag{b_udiag, h};

	h.copy(acc_udiag, acc_matrix);
	});
	evt.wait();
	}

	std::chrono::_V2::steady_clock::time_point end =
	std::chrono::steady_clock::now();

	std::free(matrix_);
	return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
	.count();
	}

	// when running with example matrix, consider
	// setting N = 4, B= 4
	//
	// below is a 4x4 matrix
	void example_matrix(float *const matrix) {
	matrix[0 * N + 0] = 2.f;
	matrix[0 * N + 1] = 4.f;
	matrix[0 * N + 2] = 3.f;
	matrix[0 * N + 3] = 5.f;

	matrix[1 * N + 0] = -4.f;
	matrix[1 * N + 1] = -7.f;
	matrix[1 * N + 2] = -5.f;
	matrix[1 * N + 3] = -8.f;

	matrix[2 * N + 0] = 6.f;
	matrix[2 * N + 1] = 8.f;
	matrix[2 * N + 2] = 2.f;
	matrix[2 * N + 3] = 9.f;

	matrix[3 * N + 0] = 4.f;
	matrix[3 * N + 1] = 9.f;
	matrix[3 * N + 2] = -2.f;
	matrix[3 * N + 3] = 14.f;
	}

	void random_matrix(float *const adj_matrix) {
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> dis(0.f, 1.f);

	for (uint i = 0; i < N; i++) {
	for (uint j = 0; j < N; j++) {
	adj_matrix[i * N + j] = dis(gen);
	}
	}
	}

	int main(int argc, char **argv) {
	device d{default_selector{}};
	queue q{d};
	std::cout << "running on " << d.get_info<info::device::name>() << std::endl;

	float matrix = (float )malloc(sizeof(float) * N * N);
	float ldiag = (float )malloc(sizeof(float) * N * N);
	float udiag = (float )malloc(sizeof(float) * N * N);

	memset(matrix, 0, sizeof(float) * N * N);
	memset(ldiag, 0, sizeof(float) * N * N);
	memset(udiag, 0, sizeof(float) * N * N);
	random_matrix(matrix);

	int64_t ts = lu_decomposition(q, matrix, ldiag, udiag);
	std::cout << "LU decomposition: " << ts << " ms" << std::endl;

	float max_dev = 0.f;
	for (uint i = 0; i < N; i++) {
	for (uint j = 0; j < N; j++) {
	float sum = 0.f;

	for (uint k = 0; k < N; k++) {
	sum += ldiag[i * N + k] * udiag[k * N + j];
	}

	float dev = std::abs(matrix[i * N + j] - sum);
	max_dev = std::max(dev, max_dev);
	}
	}

	std::cout << "max deviation: " << max_dev << std::endl;

	std::free(matrix);
	std::free(ldiag);
	std::free(udiag);

	return 0;
	}