yohm/build_run.sh

## build_run.sh
#!/bin/bash -eux

mpicc pdgesv_example.cpp -lscalapack
mpirun -np 6 ./a.out 9 2 2 3

## edgesv_example2.cpp
#include <cstdlib>
#include <iostream>
#include <mpi.h>
#include <random>
#include <vector>
#include <chrono>
#include "icecream-cpp/icecream.hpp"

// Declaration of Fortran subroutines
extern "C" {
void sl_init_(int *icontext, int *nprow, int *npcolumn);
// SL_INIT initializes an NPROW x NPCOL process grid using a row-major ordering
// of the processes. This routine retrieves a default system context which will
// include all available processes. (out) ictxt, (in) nprow, npcolumn

void blacs_gridinfo_(int *icontext, int *nprow, int *npcolumn, int *myrow,
                     int *mycolumn);
// (in) icontext: BLACS context
// (out) nprow, npcolumn: the numbers of rows and columns in this process grid
// (out) myrow, mycolumn: the process grid row- and column-index

void blacs_exit_(int *cont);
// (in) continue: if 0, all the resources are released. If nonzero, MPI
// resources are not released.

void blacs_gridexit_(int *icontext);
// (in) icontext: BLACS context

void descinit_(int *desc, int *m, int *n, int *mb, int *nb, int *irsrc,
               int *icsrc, int *icontext, int *lld, int *info);
// (out) descriptor for the global matrix. `desc` must be an array of int of
//   length 9. int[9]
// (in) m, n: rows and columns of the matrix (in) mb, nb: row,
//   column block sizes
// (in) irsrc, icsrc: the process row (column) over which the
// first row of the global matrix is distributed.
// (in) icontext: BLACS context
// (in) lld: leading dimension of the local array
// (out) info: 0 => completed successfully

void dgesd2d_(int *icontext, int *m, int *n, double *A, int *lda, int *r_dest,
              int *c_dest);
// Takes a general rectangular matrix and sends it to the destination process.
// (in) icontext: BLACS context
// (in) m,n: matrix sizes
// (in) A: matrix
// (in) lda: leading dimension (m)
// (in) r_dest, c_dest: the process corrdinate of the process to send the
// message to

void dgerv2d_(int *icontext, int *m, int *n, double *A, int *lda, int *row_src,
              int *col_src);
// Receives a message from the process into the general rectangular matrix.
// (in) icontext: BLACS context
// (in) m,n,lda: sizes of the matrix
// (out) A: matrix
// (in) row_src, col_src: the process coordinate of the source of the message

void pdgesv_(int *n, int *nrhs, double *A, int *ia, int *ja, int desc_a[9],
             int *ipvt, double *B, int *ib, int *jb, int desc_b[9], int *info);
// These subroutines solve the following systems of equations for multiple
// right-hand sides: AX = B
// (in) n: order of the submatrix = the number of rows of B
// (in) nrhs: the number of columns of B
// (in/out) A: the local part of the global general matrix A.
// (in) ia, ja: the row and the column indices of the
//   global matrix A, identifying the first row and column of the submatrix A.
// (in) desc_a: descriptor of A matrix
// (out) ipvt: the local part of the global vector ipvt, containing the pivot
// indices.
// (in/out) B: the local part of the global general matrix B,
//   containing the right-hand sides of the system.
// (in) ib, jb: the row and the column indices of the global matrix B,
//   identifying the first row and column of the submatrix B.
// (in) desc_b: descriptor of B matrix (out) info: error code
}

int main(int argc, char *argv[]) {
  icecream::ic.disable();
  // Get command line arguments
  if (argc != 5) {
    exit(1);
  }

  // Number of columns in the global matrix A
  int N = std::strtol(argv[1], nullptr, 10);
  int M = N; // Number of rows in the global matrix A
  // block size for the matrix A
  int NB = std::strtol(argv[2], nullptr, 10);
  int MB = NB;
  // Number of rows and columns in the process grid
  int NPROW = std::strtol(argv[3], nullptr, 10);
  int NPCOL = std::strtol(argv[4], nullptr, 10);

  // Initialize the process grid
  int ICTXT, MYROW, MYCOL;
  sl_init_(&ICTXT, &NPROW, &NPCOL);
  blacs_gridinfo_(&ICTXT, &NPROW, &NPCOL, &MYROW, &MYCOL);

  int my_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  icecream::ic.prefix(
      "ic (", [my_rank] { return my_rank; }, "): ");
  IC(ICTXT, NPROW, NPCOL, MYROW, MYCOL);

  // If I'm not in the process grid, go to the end of the program
  if (MYROW == -1) {
    int blacs_exitcode = 0;
    blacs_exit_(&blacs_exitcode);
    exit(0);
  }

    // Do not ever do like this in real world. Use high performance random array
    // generators

  //*****
  // Distribute matrices to the process grid
  //*****

  int CSRC = 0, RSRC = 0; // root process coordinate
  int NRHS = 1;           // Number of columns in the global solution matrix B

  // size of SUB_A
  int SUB_A_COLS = (N / (NB * NPCOL)) * NB + std::min(N % (NB * NPCOL), NB);
  int SUB_A_ROWS = (M / (MB * NPROW)) * MB + std::min(M % (MB * NPROW), MB);

  // size of SUB_B
  int SUB_B_ROWS = SUB_A_ROWS;
  int SUB_B_COLS = 1;

  IC(RSRC, CSRC, SUB_A_COLS, SUB_A_ROWS, SUB_A_ROWS, SUB_B_ROWS);

  // Initialize array descriptors
  int DESCA[9], DESCB[9];
  int INFO;
  descinit_(DESCA, &M, &N, &MB, &NB, &RSRC, &CSRC, &ICTXT, &SUB_A_ROWS, &INFO);
  descinit_(DESCB, &N, &NRHS, &MB, &NRHS, &RSRC, &CSRC, &ICTXT, &SUB_B_ROWS,
            &INFO);

  std::vector<int> IPIV(SUB_A_ROWS + NB);

  // COLUMN-MAJOR matrices
  std::vector<double> SUB_A(SUB_A_ROWS * SUB_A_COLS, 0.0);
  std::vector<double> SUB_B(SUB_B_ROWS * SUB_B_COLS, 0.0);

  // convert submatrix index (i,j) at process (p_row, p_col) into global
  // coordinate (I,J)
  auto to_global_coordinate = [SUB_A_ROWS, SUB_A_COLS, MB, NB, NPROW,
                               NPCOL](int i, int j, int p_row,
                                      int p_col) -> std::pair<int, int> {
    // block coordinate (bi, bj)
    int bi = i / MB;
    int bj = j / NB;
    // local coordinate inside the block
    int ii = i % MB;
    int jj = j % NB;
    // calculate global coordinate
    int I = bi * (MB * NPROW) + p_row * MB + ii;
    int J = bj * (NB * NPCOL) + p_col * NB + jj;
    return std::make_pair(I, J);
    // set value to local matrix
  };

  std::seed_seq seq({123456789ull, static_cast<unsigned long long>(my_rank)});
  std::mt19937_64 rnd(seq);
  std::uniform_real_distribution<double> uni;

  // generate SUB_A
  for (size_t i = 0; i < SUB_A_ROWS; i++) {
    for (size_t j = 0; j < SUB_A_COLS; j++) {
      auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
      int I = IJ.first, J = IJ.second;
      if (I < M && J < N) {
        SUB_A[i + j * SUB_A_ROWS] = uni(rnd);
      }
    }
  }
  // generate SUB_B
  for (size_t i = 0; i < SUB_B_ROWS; i++) {
    int j = 0;
    auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
    int I = IJ.first, J = IJ.second;
    if (I < M && J < N) {
      SUB_B[i] = uni(rnd);
    }
  }

  if (my_rank == 1) {
    IC(SUB_A, SUB_B);
  }

  //*****
  // Call the SCALAPACK routine
  //*****
  int IA = 1;
  int JA = 1;
  int IB = 1;
  int JB = 1;
  int LDA = 1;
  auto t0 = std::chrono::system_clock::now();
  pdgesv_(&N, &NRHS, SUB_A.data(), &IA, &JA, DESCA, IPIV.data(), SUB_B.data(),
          &IB, &JB, DESCB, &INFO);
  auto t1 = std::chrono::system_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
  if (my_rank == 0) std::cerr << "ELAPSED: " << elapsed << " ms" << std::endl;

  std::vector<double> X(M, 0.0);

  auto copy_SUB_B_to_X = [&X, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
                          &to_global_coordinate](int p_row, int p_col) {
    for (int i = 0; i < SUB_B_ROWS; i++) {
      for (int j = 0; j < SUB_B_COLS; j++) {
        auto IJ = to_global_coordinate(i, j, p_row, p_col);
        // set value to local matrix. Note SUB_A is column major
        int I = IJ.first, J = IJ.second;
        if (I >= M || J >= NRHS)
          continue;
        X[I * NRHS + J] = SUB_B[i + j * SUB_B_ROWS];
      }
    }
  };

  // collect SUB_B
  copy_SUB_B_to_X(MYROW, MYCOL);
  std::vector<double> XX(M, 0.0);
  MPI_Allreduce(X.data(), XX.data(), X.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  if ((MYROW == 0) && (MYCOL == 0)) {
    std::cout << "Matrix X:\n";
    for (int i = 0; i < M; i++) {
      if (i < 32)
        std::cout << XX[i] << ' ';
    }
    std::cout << std::endl;
  }

  // Release the process grid
  blacs_gridexit_(&ICTXT);
  int blacs_exitcode = 0;
  blacs_exit_(&blacs_exitcode);

  return 0;
}

## pdgesv_example.cpp
#include "icecream-cpp/icecream.hpp"
#include <cstdlib>
#include <iostream>
#include <mpi.h>
#include <random>
#include <vector>

// Declaration of Fortran subroutines
extern "C" {
void sl_init_(int *icontext, int *nprow, int *npcolumn);
// SL_INIT initializes an NPROW x NPCOL process grid using a row-major ordering
// of the processes. This routine retrieves a default system context which will
// include all available processes. (out) ictxt, (in) nprow, npcolumn

void blacs_gridinfo_(int *icontext, int *nprow, int *npcolumn, int *myrow,
                     int *mycolumn);
// (in) icontext: BLACS context
// (out) nprow, npcolumn: the numbers of rows and columns in this process grid
// (out) myrow, mycolumn: the process grid row- and column-index

void blacs_exit_(int *cont);
// (in) continue: if 0, all the resources are released. If nonzero, MPI
// resources are not released.

void blacs_gridexit_(int *icontext);
// (in) icontext: BLACS context

void descinit_(int *desc, int *m, int *n, int *mb, int *nb, int *irsrc,
               int *icsrc, int *icontext, int *lld, int *info);
// (out) descriptor for the global matrix. `desc` must be an array of int of
//   length 9. int[9]
// (in) m, n: rows and columns of the matrix (in) mb, nb: row,
//   column block sizes
// (in) irsrc, icsrc: the process row (column) over which the
// first row of the global matrix is distributed.
// (in) icontext: BLACS context
// (in) lld: leading dimension of the local array
// (out) info: 0 => completed successfully

void dgesd2d_(int *icontext, int *m, int *n, double *A, int *lda, int *r_dest,
              int *c_dest);
// Takes a general rectangular matrix and sends it to the destination process.
// (in) icontext: BLACS context
// (in) m,n: matrix sizes
// (in) A: matrix
// (in) lda: leading dimension (m)
// (in) r_dest, c_dest: the process corrdinate of the process to send the
// message to

void dgerv2d_(int *icontext, int *m, int *n, double *A, int *lda, int *row_src,
              int *col_src);
// Receives a message from the process into the general rectangular matrix.
// (in) icontext: BLACS context
// (in) m,n,lda: sizes of the matrix
// (out) A: matrix
// (in) row_src, col_src: the process coordinate of the source of the message

void pdgesv_(int *n, int *nrhs, double *A, int *ia, int *ja, int desc_a[9],
             int *ipvt, double *B, int *ib, int *jb, int desc_b[9], int *info);
// These subroutines solve the following systems of equations for multiple
// right-hand sides: AX = B
// (in) n: order of the submatrix = the number of rows of B
// (in) nrhs: the number of columns of B
// (in/out) A: the local part of the global general matrix A.
// (in) ia, ja: the row and the column indices of the
//   global matrix A, identifying the first row and column of the submatrix A.
// (in) desc_a: descriptor of A matrix
// (out) ipvt: the local part of the global vector ipvt, containing the pivot
// indices.
// (in/out) B: the local part of the global general matrix B,
//   containing the right-hand sides of the system.
// (in) ib, jb: the row and the column indices of the global matrix B,
//   identifying the first row and column of the submatrix B.
// (in) desc_b: descriptor of B matrix (out) info: error code
}

int main(int argc, char *argv[]) {
  icecream::ic.disable();
  // Get command line arguments
  if (argc != 5) {
    exit(1);
  }

  // Number of columns in the global matrix A
  int N = std::strtol(argv[1], nullptr, 10);
  int M = N; // Number of rows in the global matrix A
  // block size for the matrix A
  int NB = std::strtol(argv[2], nullptr, 10);
  int MB = NB;
  // Number of rows and columns in the process grid
  int NPROW = std::strtol(argv[3], nullptr, 10);
  int NPCOL = std::strtol(argv[4], nullptr, 10);

  // Initialize the process grid
  int ICTXT, MYROW, MYCOL;
  sl_init_(&ICTXT, &NPROW, &NPCOL);
  blacs_gridinfo_(&ICTXT, &NPROW, &NPCOL, &MYROW, &MYCOL);

  int my_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  icecream::ic.prefix(
      "ic (", [my_rank] { return my_rank; }, "): ");
  IC(ICTXT, NPROW, NPCOL, MYROW, MYCOL);

  // If I'm not in the process grid, go to the end of the program
  if (MYROW == -1) {
    int blacs_exitcode = 0;
    blacs_exit_(&blacs_exitcode);
    exit(0);
  }

  // global matrix A and B
  std::vector<double> A;
  std::vector<double> B;
  // Generate matrices A and B
  if (MYROW == 0 && MYCOL == 0) {
    A.resize(M * N);
    B.resize(M);

    // Do not ever do like this in real world. Use high performance random array
    // generators
    std::mt19937_64 rnd(123456789ull);
    std::uniform_real_distribution<double> uni;

    std::cout << "Matrix A:\n";
    // A is a row-major matrix (as in C,C++) but SUB_A is
    // the column major matrix as in Fortran
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) {
        // ROW-MAJOR matrix
        A[i * N + j] = uni(rnd);
        if (i < 32 && j < 32)
          std::cout << A[i * N + j] << ' ';
      }
      if (i < 32)
        std::cout << std::endl;
    }

    std::cout << "Matrix B:\n";
    for (int i = 0; i < M; i++) {
      B[i] = uni(rnd);
      if (i < 32)
        std::cout << B[i] << ' ';
    }
    std::cout << std::endl;
  }

  //*****
  // Distribute matrices to the process grid
  //*****

  int CSRC = 0, RSRC = 0; // root process coordinate
  int NRHS = 1;           // Number of columns in the global solution matrix B

  // size of SUB_A
  int SUB_A_COLS = (N / (NB * NPCOL)) * NB + std::min(N % (NB * NPCOL), NB);
  int SUB_A_ROWS = (M / (MB * NPROW)) * MB + std::min(M % (MB * NPROW), MB);

  // size of SUB_B
  int SUB_B_ROWS = SUB_A_ROWS;
  int SUB_B_COLS = 1;

  IC(RSRC, CSRC, SUB_A_COLS, SUB_A_ROWS, SUB_A_ROWS, SUB_B_ROWS);

  // Initialize array descriptors
  int DESCA[9], DESCB[9];
  int INFO;
  descinit_(DESCA, &M, &N, &MB, &NB, &RSRC, &CSRC, &ICTXT, &SUB_A_ROWS, &INFO);
  descinit_(DESCB, &N, &NRHS, &MB, &NRHS, &RSRC, &CSRC, &ICTXT, &SUB_B_ROWS,
            &INFO);

  std::vector<int> IPIV(SUB_A_ROWS + NB);

  // COLUMN-MAJOR matrices
  std::vector<double> SUB_A(SUB_A_ROWS * SUB_A_COLS, 0.0);
  std::vector<double> SUB_B(SUB_B_ROWS * SUB_B_COLS, 0.0);

  // convert submatrix index (i,j) at process (p_row, p_col) into global
  // coordinate (I,J)
  auto to_global_coordinate = [SUB_A_ROWS, SUB_A_COLS, MB, NB, NPROW,
                               NPCOL](int i, int j, int p_row,
                                      int p_col) -> std::pair<int, int> {
    // block coordinate (bi, bj)
    int bi = i / MB;
    int bj = j / NB;
    // local coordinate inside the block
    int ii = i % MB;
    int jj = j % NB;
    // calculate global coordinate
    int I = bi * (MB * NPROW) + p_row * MB + ii;
    int J = bj * (NB * NPCOL) + p_col * NB + jj;
    return std::make_pair(I, J);
    // set value to local matrix
  };

  auto copy_A_to_SUB_A = [&A, &SUB_A, SUB_A_ROWS, SUB_A_COLS, M, N,
                          &to_global_coordinate](int p_row, int p_col) {
    for (int i = 0; i < SUB_A_ROWS; i++) {
      for (int j = 0; j < SUB_A_COLS; j++) {
        auto IJ = to_global_coordinate(i, j, p_row, p_col);
        // set value to local matrix. Note SUB_A is column major
        int I = IJ.first, J = IJ.second;
        if (I >= M || J >= N)
          continue;
        // IC(p_row, p_col, i, j, I, J);
        SUB_A[i + j * SUB_A_ROWS] = A[I * N + J];
      }
    }
  };
  auto copy_B_to_SUB_B = [&B, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
                          &to_global_coordinate](int p_row, int p_col) {
    for (int i = 0; i < SUB_B_ROWS; i++) {
      for (int j = 0; j < SUB_B_COLS; j++) {
        auto IJ = to_global_coordinate(i, j, p_row, p_col);
        // set value to local matrix. Note SUB_A is column major
        int I = IJ.first, J = IJ.second;
        if (I >= M || J >= NRHS)
          continue;
        SUB_B[i + j * SUB_B_ROWS] = B[I * NRHS + J];
      }
    }
  };
  auto copy_SUB_B_to_B = [&B, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
                          &to_global_coordinate](int p_row, int p_col) {
    for (int i = 0; i < SUB_B_ROWS; i++) {
      for (int j = 0; j < SUB_B_COLS; j++) {
        auto IJ = to_global_coordinate(i, j, p_row, p_col);
        // set value to local matrix. Note SUB_A is column major
        int I = IJ.first, J = IJ.second;
        if (I >= M || J >= NRHS)
          continue;
        B[I * NRHS + J] = SUB_B[i + j * SUB_B_ROWS];
      }
    }
  };

  // send SUB_A
  int LDA = 1; // The distance between two elements in matrix row
  if (MYROW == 0 && MYCOL == 0) {
    for (int p_row = 0; p_row < NPROW; p_row++) {
      for (int p_col = 0; p_col < NPCOL; p_col++) {
        if (p_row == 0 && p_col == 0)
          continue;
        copy_A_to_SUB_A(p_row, p_col);
        dgesd2d_(&ICTXT, &SUB_A_ROWS, &SUB_A_COLS, SUB_A.data(), &LDA, &p_row,
                 &p_col);
      }
    }
    copy_A_to_SUB_A(0, 0); // set local submatrix for rank 0
  } else {
    dgerv2d_(&ICTXT, &SUB_A_ROWS, &SUB_A_COLS, SUB_A.data(), &LDA, &RSRC,
             &CSRC);
  }
  // distribute SUB_B
  if (MYROW == 0 && MYCOL == 0) {
    for (int p_row = 1; p_row < NPROW; p_row++) {
      int p_col = 0;
      copy_B_to_SUB_B(p_row, p_col);
      dgesd2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &p_row,
               &p_col);
    }
    copy_B_to_SUB_B(0, 0);
  } else if (MYCOL == 0) {
    dgerv2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &RSRC,
             &CSRC);
  }

  if (my_rank == 1) {
    IC(SUB_A, SUB_B);
  }

  //*****
  // Call the SCALAPACK routine
  //*****
  int IA = 1;
  int JA = 1;
  int IB = 1;
  int JB = 1;
  pdgesv_(&N, &NRHS, SUB_A.data(), &IA, &JA, DESCA, IPIV.data(), SUB_B.data(),
          &IB, &JB, DESCB, &INFO);

  // collect SUB_B
  if (MYROW == 0 && MYCOL == 0) {
    copy_SUB_B_to_B(0, 0);
    for (int p_row = 1; p_row < NPROW; p_row++) {
      int p_col = 0;
      dgerv2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &p_row,
               &p_col);
      copy_SUB_B_to_B(p_row, p_col);
    }
  } else if (MYCOL == 0) {
    dgesd2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &RSRC,
             &CSRC);
  }

  if ((MYROW == 0) && (MYCOL == 0)) {
    std::cout << "Matrix X:\n";
    for (int i = 0; i < M; i++) {
      if (i < 32)
        std::cout << B[i] << ' ';
    }
    std::cout << std::endl;

    std::cout << "Matrix AX:\n";
    for (int i = 0; i < M; i++) {
      double ans = 0.0;
      if (i == 32)
        break;
      for (int j = 0; j < N; j++) {
        ans += A[i * N + j] * B[j];
      }
      std::cout << ans << ' ';
    }
    std::cout << std::endl;
  }

  // Release the process grid
  blacs_gridexit_(&ICTXT);
  int blacs_exitcode = 0;
  blacs_exit_(&blacs_exitcode);

  return 0;
}
	#!/bin/bash -eux

	mpicc pdgesv_example.cpp -lscalapack
	mpirun -np 6 ./a.out 9 2 2 3
	#include <cstdlib>
	#include <iostream>
	#include <mpi.h>
	#include <random>
	#include <vector>
	#include <chrono>
	#include "icecream-cpp/icecream.hpp"

	// Declaration of Fortran subroutines
	extern "C" {
	void sl_init_(int icontext, int nprow, int *npcolumn);
	// SL_INIT initializes an NPROW x NPCOL process grid using a row-major ordering
	// of the processes. This routine retrieves a default system context which will
	// include all available processes. (out) ictxt, (in) nprow, npcolumn

	void blacs_gridinfo_(int icontext, int nprow, int npcolumn, int myrow,
	int *mycolumn);
	// (in) icontext: BLACS context
	// (out) nprow, npcolumn: the numbers of rows and columns in this process grid
	// (out) myrow, mycolumn: the process grid row- and column-index

	void blacs_exit_(int *cont);
	// (in) continue: if 0, all the resources are released. If nonzero, MPI
	// resources are not released.

	void blacs_gridexit_(int *icontext);
	// (in) icontext: BLACS context

	void descinit_(int desc, int m, int n, int mb, int nb, int irsrc,
	int icsrc, int icontext, int lld, int info);
	// (out) descriptor for the global matrix. `desc` must be an array of int of
	// length 9. int[9]
	// (in) m, n: rows and columns of the matrix (in) mb, nb: row,
	// column block sizes
	// (in) irsrc, icsrc: the process row (column) over which the
	// first row of the global matrix is distributed.
	// (in) icontext: BLACS context
	// (in) lld: leading dimension of the local array
	// (out) info: 0 => completed successfully

	void dgesd2d_(int icontext, int m, int n, double A, int lda, int r_dest,
	int *c_dest);
	// Takes a general rectangular matrix and sends it to the destination process.
	// (in) icontext: BLACS context
	// (in) m,n: matrix sizes
	// (in) A: matrix
	// (in) lda: leading dimension (m)
	// (in) r_dest, c_dest: the process corrdinate of the process to send the
	// message to

	void dgerv2d_(int icontext, int m, int n, double A, int lda, int row_src,
	int *col_src);
	// Receives a message from the process into the general rectangular matrix.
	// (in) icontext: BLACS context
	// (in) m,n,lda: sizes of the matrix
	// (out) A: matrix
	// (in) row_src, col_src: the process coordinate of the source of the message

	void pdgesv_(int n, int nrhs, double A, int ia, int *ja, int desc_a[9],
	int ipvt, double B, int ib, int jb, int desc_b[9], int *info);
	// These subroutines solve the following systems of equations for multiple
	// right-hand sides: AX = B
	// (in) n: order of the submatrix = the number of rows of B
	// (in) nrhs: the number of columns of B
	// (in/out) A: the local part of the global general matrix A.
	// (in) ia, ja: the row and the column indices of the
	// global matrix A, identifying the first row and column of the submatrix A.
	// (in) desc_a: descriptor of A matrix
	// (out) ipvt: the local part of the global vector ipvt, containing the pivot
	// indices.
	// (in/out) B: the local part of the global general matrix B,
	// containing the right-hand sides of the system.
	// (in) ib, jb: the row and the column indices of the global matrix B,
	// identifying the first row and column of the submatrix B.
	// (in) desc_b: descriptor of B matrix (out) info: error code
	}

	int main(int argc, char *argv[]) {
	icecream::ic.disable();
	// Get command line arguments
	if (argc != 5) {
	exit(1);
	}

	// Number of columns in the global matrix A
	int N = std::strtol(argv[1], nullptr, 10);
	int M = N; // Number of rows in the global matrix A
	// block size for the matrix A
	int NB = std::strtol(argv[2], nullptr, 10);
	int MB = NB;
	// Number of rows and columns in the process grid
	int NPROW = std::strtol(argv[3], nullptr, 10);
	int NPCOL = std::strtol(argv[4], nullptr, 10);

	// Initialize the process grid
	int ICTXT, MYROW, MYCOL;
	sl_init_(&ICTXT, &NPROW, &NPCOL);
	blacs_gridinfo_(&ICTXT, &NPROW, &NPCOL, &MYROW, &MYCOL);

	int my_rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	icecream::ic.prefix(
	"ic (", [my_rank] { return my_rank; }, "): ");
	IC(ICTXT, NPROW, NPCOL, MYROW, MYCOL);

	// If I'm not in the process grid, go to the end of the program
	if (MYROW == -1) {
	int blacs_exitcode = 0;
	blacs_exit_(&blacs_exitcode);
	exit(0);
	}

	// Do not ever do like this in real world. Use high performance random array
	// generators

	//*****
	// Distribute matrices to the process grid
	//*****

	int CSRC = 0, RSRC = 0; // root process coordinate
	int NRHS = 1; // Number of columns in the global solution matrix B

	// size of SUB_A
	int SUB_A_COLS = (N / (NB * NPCOL)) * NB + std::min(N % (NB * NPCOL), NB);
	int SUB_A_ROWS = (M / (MB * NPROW)) * MB + std::min(M % (MB * NPROW), MB);

	// size of SUB_B
	int SUB_B_ROWS = SUB_A_ROWS;
	int SUB_B_COLS = 1;

	IC(RSRC, CSRC, SUB_A_COLS, SUB_A_ROWS, SUB_A_ROWS, SUB_B_ROWS);

	// Initialize array descriptors
	int DESCA[9], DESCB[9];
	int INFO;
	descinit_(DESCA, &M, &N, &MB, &NB, &RSRC, &CSRC, &ICTXT, &SUB_A_ROWS, &INFO);
	descinit_(DESCB, &N, &NRHS, &MB, &NRHS, &RSRC, &CSRC, &ICTXT, &SUB_B_ROWS,
	&INFO);

	std::vector<int> IPIV(SUB_A_ROWS + NB);

	// COLUMN-MAJOR matrices
	std::vector<double> SUB_A(SUB_A_ROWS * SUB_A_COLS, 0.0);
	std::vector<double> SUB_B(SUB_B_ROWS * SUB_B_COLS, 0.0);

	// convert submatrix index (i,j) at process (p_row, p_col) into global
	// coordinate (I,J)
	auto to_global_coordinate = [SUB_A_ROWS, SUB_A_COLS, MB, NB, NPROW,
	NPCOL](int i, int j, int p_row,
	int p_col) -> std::pair<int, int> {
	// block coordinate (bi, bj)
	int bi = i / MB;
	int bj = j / NB;
	// local coordinate inside the block
	int ii = i % MB;
	int jj = j % NB;
	// calculate global coordinate
	int I = bi * (MB * NPROW) + p_row * MB + ii;
	int J = bj * (NB * NPCOL) + p_col * NB + jj;
	return std::make_pair(I, J);
	// set value to local matrix
	};

	std::seed_seq seq({123456789ull, static_cast<unsigned long long>(my_rank)});
	std::mt19937_64 rnd(seq);
	std::uniform_real_distribution<double> uni;

	// generate SUB_A
	for (size_t i = 0; i < SUB_A_ROWS; i++) {
	for (size_t j = 0; j < SUB_A_COLS; j++) {
	auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
	int I = IJ.first, J = IJ.second;
	if (I < M && J < N) {
	SUB_A[i + j * SUB_A_ROWS] = uni(rnd);
	}
	}
	}
	// generate SUB_B
	for (size_t i = 0; i < SUB_B_ROWS; i++) {
	int j = 0;
	auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
	int I = IJ.first, J = IJ.second;
	if (I < M && J < N) {
	SUB_B[i] = uni(rnd);
	}
	}

	if (my_rank == 1) {
	IC(SUB_A, SUB_B);
	}

	//*****
	// Call the SCALAPACK routine
	//*****
	int IA = 1;
	int JA = 1;
	int IB = 1;
	int JB = 1;
	int LDA = 1;
	auto t0 = std::chrono::system_clock::now();
	pdgesv_(&N, &NRHS, SUB_A.data(), &IA, &JA, DESCA, IPIV.data(), SUB_B.data(),
	&IB, &JB, DESCB, &INFO);
	auto t1 = std::chrono::system_clock::now();
	double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
	if (my_rank == 0) std::cerr << "ELAPSED: " << elapsed << " ms" << std::endl;

	std::vector<double> X(M, 0.0);

	auto copy_SUB_B_to_X = [&X, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
	&to_global_coordinate](int p_row, int p_col) {
	for (int i = 0; i < SUB_B_ROWS; i++) {
	for (int j = 0; j < SUB_B_COLS; j++) {
	auto IJ = to_global_coordinate(i, j, p_row, p_col);
	// set value to local matrix. Note SUB_A is column major
	int I = IJ.first, J = IJ.second;
	if (I >= M \|\| J >= NRHS)
	continue;
	X[I * NRHS + J] = SUB_B[i + j * SUB_B_ROWS];
	}
	}
	};

	// collect SUB_B
	copy_SUB_B_to_X(MYROW, MYCOL);
	std::vector<double> XX(M, 0.0);
	MPI_Allreduce(X.data(), XX.data(), X.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

	if ((MYROW == 0) && (MYCOL == 0)) {
	std::cout << "Matrix X:\n";
	for (int i = 0; i < M; i++) {
	if (i < 32)
	std::cout << XX[i] << ' ';
	}
	std::cout << std::endl;
	}

	// Release the process grid
	blacs_gridexit_(&ICTXT);
	int blacs_exitcode = 0;
	blacs_exit_(&blacs_exitcode);

	return 0;
	}