Skip to content

Instantly share code, notes, and snippets.

@yohm
Last active October 16, 2021 02:35
Show Gist options
  • Save yohm/55cdf2805fab8a92e1b617d1308d49a2 to your computer and use it in GitHub Desktop.
Save yohm/55cdf2805fab8a92e1b617d1308d49a2 to your computer and use it in GitHub Desktop.
a sample program of PDGESV of Scalapack in C++
#!/bin/bash -eux
mpicc pdgesv_example.cpp -lscalapack
mpirun -np 6 ./a.out 9 2 2 3
#include <cstdlib>
#include <iostream>
#include <mpi.h>
#include <random>
#include <vector>
#include <chrono>
#include "icecream-cpp/icecream.hpp"
// Declaration of Fortran subroutines
extern "C" {
void sl_init_(int *icontext, int *nprow, int *npcolumn);
// SL_INIT initializes an NPROW x NPCOL process grid using a row-major ordering
// of the processes. This routine retrieves a default system context which will
// include all available processes. (out) ictxt, (in) nprow, npcolumn
void blacs_gridinfo_(int *icontext, int *nprow, int *npcolumn, int *myrow,
int *mycolumn);
// (in) icontext: BLACS context
// (out) nprow, npcolumn: the numbers of rows and columns in this process grid
// (out) myrow, mycolumn: the process grid row- and column-index
void blacs_exit_(int *cont);
// (in) continue: if 0, all the resources are released. If nonzero, MPI
// resources are not released.
void blacs_gridexit_(int *icontext);
// (in) icontext: BLACS context
void descinit_(int *desc, int *m, int *n, int *mb, int *nb, int *irsrc,
int *icsrc, int *icontext, int *lld, int *info);
// (out) descriptor for the global matrix. `desc` must be an array of int of
// length 9. int[9]
// (in) m, n: rows and columns of the matrix (in) mb, nb: row,
// column block sizes
// (in) irsrc, icsrc: the process row (column) over which the
// first row of the global matrix is distributed.
// (in) icontext: BLACS context
// (in) lld: leading dimension of the local array
// (out) info: 0 => completed successfully
void dgesd2d_(int *icontext, int *m, int *n, double *A, int *lda, int *r_dest,
int *c_dest);
// Takes a general rectangular matrix and sends it to the destination process.
// (in) icontext: BLACS context
// (in) m,n: matrix sizes
// (in) A: matrix
// (in) lda: leading dimension (m)
// (in) r_dest, c_dest: the process corrdinate of the process to send the
// message to
void dgerv2d_(int *icontext, int *m, int *n, double *A, int *lda, int *row_src,
int *col_src);
// Receives a message from the process into the general rectangular matrix.
// (in) icontext: BLACS context
// (in) m,n,lda: sizes of the matrix
// (out) A: matrix
// (in) row_src, col_src: the process coordinate of the source of the message
void pdgesv_(int *n, int *nrhs, double *A, int *ia, int *ja, int desc_a[9],
int *ipvt, double *B, int *ib, int *jb, int desc_b[9], int *info);
// These subroutines solve the following systems of equations for multiple
// right-hand sides: AX = B
// (in) n: order of the submatrix = the number of rows of B
// (in) nrhs: the number of columns of B
// (in/out) A: the local part of the global general matrix A.
// (in) ia, ja: the row and the column indices of the
// global matrix A, identifying the first row and column of the submatrix A.
// (in) desc_a: descriptor of A matrix
// (out) ipvt: the local part of the global vector ipvt, containing the pivot
// indices.
// (in/out) B: the local part of the global general matrix B,
// containing the right-hand sides of the system.
// (in) ib, jb: the row and the column indices of the global matrix B,
// identifying the first row and column of the submatrix B.
// (in) desc_b: descriptor of B matrix (out) info: error code
}
int main(int argc, char *argv[]) {
icecream::ic.disable();
// Get command line arguments
if (argc != 5) {
exit(1);
}
// Number of columns in the global matrix A
int N = std::strtol(argv[1], nullptr, 10);
int M = N; // Number of rows in the global matrix A
// block size for the matrix A
int NB = std::strtol(argv[2], nullptr, 10);
int MB = NB;
// Number of rows and columns in the process grid
int NPROW = std::strtol(argv[3], nullptr, 10);
int NPCOL = std::strtol(argv[4], nullptr, 10);
// Initialize the process grid
int ICTXT, MYROW, MYCOL;
sl_init_(&ICTXT, &NPROW, &NPCOL);
blacs_gridinfo_(&ICTXT, &NPROW, &NPCOL, &MYROW, &MYCOL);
int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
icecream::ic.prefix(
"ic (", [my_rank] { return my_rank; }, "): ");
IC(ICTXT, NPROW, NPCOL, MYROW, MYCOL);
// If I'm not in the process grid, go to the end of the program
if (MYROW == -1) {
int blacs_exitcode = 0;
blacs_exit_(&blacs_exitcode);
exit(0);
}
// Do not ever do like this in real world. Use high performance random array
// generators
//*****
// Distribute matrices to the process grid
//*****
int CSRC = 0, RSRC = 0; // root process coordinate
int NRHS = 1; // Number of columns in the global solution matrix B
// size of SUB_A
int SUB_A_COLS = (N / (NB * NPCOL)) * NB + std::min(N % (NB * NPCOL), NB);
int SUB_A_ROWS = (M / (MB * NPROW)) * MB + std::min(M % (MB * NPROW), MB);
// size of SUB_B
int SUB_B_ROWS = SUB_A_ROWS;
int SUB_B_COLS = 1;
IC(RSRC, CSRC, SUB_A_COLS, SUB_A_ROWS, SUB_A_ROWS, SUB_B_ROWS);
// Initialize array descriptors
int DESCA[9], DESCB[9];
int INFO;
descinit_(DESCA, &M, &N, &MB, &NB, &RSRC, &CSRC, &ICTXT, &SUB_A_ROWS, &INFO);
descinit_(DESCB, &N, &NRHS, &MB, &NRHS, &RSRC, &CSRC, &ICTXT, &SUB_B_ROWS,
&INFO);
std::vector<int> IPIV(SUB_A_ROWS + NB);
// COLUMN-MAJOR matrices
std::vector<double> SUB_A(SUB_A_ROWS * SUB_A_COLS, 0.0);
std::vector<double> SUB_B(SUB_B_ROWS * SUB_B_COLS, 0.0);
// convert submatrix index (i,j) at process (p_row, p_col) into global
// coordinate (I,J)
auto to_global_coordinate = [SUB_A_ROWS, SUB_A_COLS, MB, NB, NPROW,
NPCOL](int i, int j, int p_row,
int p_col) -> std::pair<int, int> {
// block coordinate (bi, bj)
int bi = i / MB;
int bj = j / NB;
// local coordinate inside the block
int ii = i % MB;
int jj = j % NB;
// calculate global coordinate
int I = bi * (MB * NPROW) + p_row * MB + ii;
int J = bj * (NB * NPCOL) + p_col * NB + jj;
return std::make_pair(I, J);
// set value to local matrix
};
std::seed_seq seq({123456789ull, static_cast<unsigned long long>(my_rank)});
std::mt19937_64 rnd(seq);
std::uniform_real_distribution<double> uni;
// generate SUB_A
for (size_t i = 0; i < SUB_A_ROWS; i++) {
for (size_t j = 0; j < SUB_A_COLS; j++) {
auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
int I = IJ.first, J = IJ.second;
if (I < M && J < N) {
SUB_A[i + j * SUB_A_ROWS] = uni(rnd);
}
}
}
// generate SUB_B
for (size_t i = 0; i < SUB_B_ROWS; i++) {
int j = 0;
auto IJ = to_global_coordinate(i, j, MYROW, MYCOL);
int I = IJ.first, J = IJ.second;
if (I < M && J < N) {
SUB_B[i] = uni(rnd);
}
}
if (my_rank == 1) {
IC(SUB_A, SUB_B);
}
//*****
// Call the SCALAPACK routine
//*****
int IA = 1;
int JA = 1;
int IB = 1;
int JB = 1;
int LDA = 1;
auto t0 = std::chrono::system_clock::now();
pdgesv_(&N, &NRHS, SUB_A.data(), &IA, &JA, DESCA, IPIV.data(), SUB_B.data(),
&IB, &JB, DESCB, &INFO);
auto t1 = std::chrono::system_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
if (my_rank == 0) std::cerr << "ELAPSED: " << elapsed << " ms" << std::endl;
std::vector<double> X(M, 0.0);
auto copy_SUB_B_to_X = [&X, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
&to_global_coordinate](int p_row, int p_col) {
for (int i = 0; i < SUB_B_ROWS; i++) {
for (int j = 0; j < SUB_B_COLS; j++) {
auto IJ = to_global_coordinate(i, j, p_row, p_col);
// set value to local matrix. Note SUB_A is column major
int I = IJ.first, J = IJ.second;
if (I >= M || J >= NRHS)
continue;
X[I * NRHS + J] = SUB_B[i + j * SUB_B_ROWS];
}
}
};
// collect SUB_B
copy_SUB_B_to_X(MYROW, MYCOL);
std::vector<double> XX(M, 0.0);
MPI_Allreduce(X.data(), XX.data(), X.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
if ((MYROW == 0) && (MYCOL == 0)) {
std::cout << "Matrix X:\n";
for (int i = 0; i < M; i++) {
if (i < 32)
std::cout << XX[i] << ' ';
}
std::cout << std::endl;
}
// Release the process grid
blacs_gridexit_(&ICTXT);
int blacs_exitcode = 0;
blacs_exit_(&blacs_exitcode);
return 0;
}
#include "icecream-cpp/icecream.hpp"
#include <cstdlib>
#include <iostream>
#include <mpi.h>
#include <random>
#include <vector>
// Declaration of Fortran subroutines
extern "C" {
void sl_init_(int *icontext, int *nprow, int *npcolumn);
// SL_INIT initializes an NPROW x NPCOL process grid using a row-major ordering
// of the processes. This routine retrieves a default system context which will
// include all available processes. (out) ictxt, (in) nprow, npcolumn
void blacs_gridinfo_(int *icontext, int *nprow, int *npcolumn, int *myrow,
int *mycolumn);
// (in) icontext: BLACS context
// (out) nprow, npcolumn: the numbers of rows and columns in this process grid
// (out) myrow, mycolumn: the process grid row- and column-index
void blacs_exit_(int *cont);
// (in) continue: if 0, all the resources are released. If nonzero, MPI
// resources are not released.
void blacs_gridexit_(int *icontext);
// (in) icontext: BLACS context
void descinit_(int *desc, int *m, int *n, int *mb, int *nb, int *irsrc,
int *icsrc, int *icontext, int *lld, int *info);
// (out) descriptor for the global matrix. `desc` must be an array of int of
// length 9. int[9]
// (in) m, n: rows and columns of the matrix (in) mb, nb: row,
// column block sizes
// (in) irsrc, icsrc: the process row (column) over which the
// first row of the global matrix is distributed.
// (in) icontext: BLACS context
// (in) lld: leading dimension of the local array
// (out) info: 0 => completed successfully
void dgesd2d_(int *icontext, int *m, int *n, double *A, int *lda, int *r_dest,
int *c_dest);
// Takes a general rectangular matrix and sends it to the destination process.
// (in) icontext: BLACS context
// (in) m,n: matrix sizes
// (in) A: matrix
// (in) lda: leading dimension (m)
// (in) r_dest, c_dest: the process corrdinate of the process to send the
// message to
void dgerv2d_(int *icontext, int *m, int *n, double *A, int *lda, int *row_src,
int *col_src);
// Receives a message from the process into the general rectangular matrix.
// (in) icontext: BLACS context
// (in) m,n,lda: sizes of the matrix
// (out) A: matrix
// (in) row_src, col_src: the process coordinate of the source of the message
void pdgesv_(int *n, int *nrhs, double *A, int *ia, int *ja, int desc_a[9],
int *ipvt, double *B, int *ib, int *jb, int desc_b[9], int *info);
// These subroutines solve the following systems of equations for multiple
// right-hand sides: AX = B
// (in) n: order of the submatrix = the number of rows of B
// (in) nrhs: the number of columns of B
// (in/out) A: the local part of the global general matrix A.
// (in) ia, ja: the row and the column indices of the
// global matrix A, identifying the first row and column of the submatrix A.
// (in) desc_a: descriptor of A matrix
// (out) ipvt: the local part of the global vector ipvt, containing the pivot
// indices.
// (in/out) B: the local part of the global general matrix B,
// containing the right-hand sides of the system.
// (in) ib, jb: the row and the column indices of the global matrix B,
// identifying the first row and column of the submatrix B.
// (in) desc_b: descriptor of B matrix (out) info: error code
}
int main(int argc, char *argv[]) {
icecream::ic.disable();
// Get command line arguments
if (argc != 5) {
exit(1);
}
// Number of columns in the global matrix A
int N = std::strtol(argv[1], nullptr, 10);
int M = N; // Number of rows in the global matrix A
// block size for the matrix A
int NB = std::strtol(argv[2], nullptr, 10);
int MB = NB;
// Number of rows and columns in the process grid
int NPROW = std::strtol(argv[3], nullptr, 10);
int NPCOL = std::strtol(argv[4], nullptr, 10);
// Initialize the process grid
int ICTXT, MYROW, MYCOL;
sl_init_(&ICTXT, &NPROW, &NPCOL);
blacs_gridinfo_(&ICTXT, &NPROW, &NPCOL, &MYROW, &MYCOL);
int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
icecream::ic.prefix(
"ic (", [my_rank] { return my_rank; }, "): ");
IC(ICTXT, NPROW, NPCOL, MYROW, MYCOL);
// If I'm not in the process grid, go to the end of the program
if (MYROW == -1) {
int blacs_exitcode = 0;
blacs_exit_(&blacs_exitcode);
exit(0);
}
// global matrix A and B
std::vector<double> A;
std::vector<double> B;
// Generate matrices A and B
if (MYROW == 0 && MYCOL == 0) {
A.resize(M * N);
B.resize(M);
// Do not ever do like this in real world. Use high performance random array
// generators
std::mt19937_64 rnd(123456789ull);
std::uniform_real_distribution<double> uni;
std::cout << "Matrix A:\n";
// A is a row-major matrix (as in C,C++) but SUB_A is
// the column major matrix as in Fortran
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
// ROW-MAJOR matrix
A[i * N + j] = uni(rnd);
if (i < 32 && j < 32)
std::cout << A[i * N + j] << ' ';
}
if (i < 32)
std::cout << std::endl;
}
std::cout << "Matrix B:\n";
for (int i = 0; i < M; i++) {
B[i] = uni(rnd);
if (i < 32)
std::cout << B[i] << ' ';
}
std::cout << std::endl;
}
//*****
// Distribute matrices to the process grid
//*****
int CSRC = 0, RSRC = 0; // root process coordinate
int NRHS = 1; // Number of columns in the global solution matrix B
// size of SUB_A
int SUB_A_COLS = (N / (NB * NPCOL)) * NB + std::min(N % (NB * NPCOL), NB);
int SUB_A_ROWS = (M / (MB * NPROW)) * MB + std::min(M % (MB * NPROW), MB);
// size of SUB_B
int SUB_B_ROWS = SUB_A_ROWS;
int SUB_B_COLS = 1;
IC(RSRC, CSRC, SUB_A_COLS, SUB_A_ROWS, SUB_A_ROWS, SUB_B_ROWS);
// Initialize array descriptors
int DESCA[9], DESCB[9];
int INFO;
descinit_(DESCA, &M, &N, &MB, &NB, &RSRC, &CSRC, &ICTXT, &SUB_A_ROWS, &INFO);
descinit_(DESCB, &N, &NRHS, &MB, &NRHS, &RSRC, &CSRC, &ICTXT, &SUB_B_ROWS,
&INFO);
std::vector<int> IPIV(SUB_A_ROWS + NB);
// COLUMN-MAJOR matrices
std::vector<double> SUB_A(SUB_A_ROWS * SUB_A_COLS, 0.0);
std::vector<double> SUB_B(SUB_B_ROWS * SUB_B_COLS, 0.0);
// convert submatrix index (i,j) at process (p_row, p_col) into global
// coordinate (I,J)
auto to_global_coordinate = [SUB_A_ROWS, SUB_A_COLS, MB, NB, NPROW,
NPCOL](int i, int j, int p_row,
int p_col) -> std::pair<int, int> {
// block coordinate (bi, bj)
int bi = i / MB;
int bj = j / NB;
// local coordinate inside the block
int ii = i % MB;
int jj = j % NB;
// calculate global coordinate
int I = bi * (MB * NPROW) + p_row * MB + ii;
int J = bj * (NB * NPCOL) + p_col * NB + jj;
return std::make_pair(I, J);
// set value to local matrix
};
auto copy_A_to_SUB_A = [&A, &SUB_A, SUB_A_ROWS, SUB_A_COLS, M, N,
&to_global_coordinate](int p_row, int p_col) {
for (int i = 0; i < SUB_A_ROWS; i++) {
for (int j = 0; j < SUB_A_COLS; j++) {
auto IJ = to_global_coordinate(i, j, p_row, p_col);
// set value to local matrix. Note SUB_A is column major
int I = IJ.first, J = IJ.second;
if (I >= M || J >= N)
continue;
// IC(p_row, p_col, i, j, I, J);
SUB_A[i + j * SUB_A_ROWS] = A[I * N + J];
}
}
};
auto copy_B_to_SUB_B = [&B, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
&to_global_coordinate](int p_row, int p_col) {
for (int i = 0; i < SUB_B_ROWS; i++) {
for (int j = 0; j < SUB_B_COLS; j++) {
auto IJ = to_global_coordinate(i, j, p_row, p_col);
// set value to local matrix. Note SUB_A is column major
int I = IJ.first, J = IJ.second;
if (I >= M || J >= NRHS)
continue;
SUB_B[i + j * SUB_B_ROWS] = B[I * NRHS + J];
}
}
};
auto copy_SUB_B_to_B = [&B, &SUB_B, SUB_B_ROWS, SUB_B_COLS, M, NRHS,
&to_global_coordinate](int p_row, int p_col) {
for (int i = 0; i < SUB_B_ROWS; i++) {
for (int j = 0; j < SUB_B_COLS; j++) {
auto IJ = to_global_coordinate(i, j, p_row, p_col);
// set value to local matrix. Note SUB_A is column major
int I = IJ.first, J = IJ.second;
if (I >= M || J >= NRHS)
continue;
B[I * NRHS + J] = SUB_B[i + j * SUB_B_ROWS];
}
}
};
// send SUB_A
int LDA = 1; // The distance between two elements in matrix row
if (MYROW == 0 && MYCOL == 0) {
for (int p_row = 0; p_row < NPROW; p_row++) {
for (int p_col = 0; p_col < NPCOL; p_col++) {
if (p_row == 0 && p_col == 0)
continue;
copy_A_to_SUB_A(p_row, p_col);
dgesd2d_(&ICTXT, &SUB_A_ROWS, &SUB_A_COLS, SUB_A.data(), &LDA, &p_row,
&p_col);
}
}
copy_A_to_SUB_A(0, 0); // set local submatrix for rank 0
} else {
dgerv2d_(&ICTXT, &SUB_A_ROWS, &SUB_A_COLS, SUB_A.data(), &LDA, &RSRC,
&CSRC);
}
// distribute SUB_B
if (MYROW == 0 && MYCOL == 0) {
for (int p_row = 1; p_row < NPROW; p_row++) {
int p_col = 0;
copy_B_to_SUB_B(p_row, p_col);
dgesd2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &p_row,
&p_col);
}
copy_B_to_SUB_B(0, 0);
} else if (MYCOL == 0) {
dgerv2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &RSRC,
&CSRC);
}
if (my_rank == 1) {
IC(SUB_A, SUB_B);
}
//*****
// Call the SCALAPACK routine
//*****
int IA = 1;
int JA = 1;
int IB = 1;
int JB = 1;
pdgesv_(&N, &NRHS, SUB_A.data(), &IA, &JA, DESCA, IPIV.data(), SUB_B.data(),
&IB, &JB, DESCB, &INFO);
// collect SUB_B
if (MYROW == 0 && MYCOL == 0) {
copy_SUB_B_to_B(0, 0);
for (int p_row = 1; p_row < NPROW; p_row++) {
int p_col = 0;
dgerv2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &p_row,
&p_col);
copy_SUB_B_to_B(p_row, p_col);
}
} else if (MYCOL == 0) {
dgesd2d_(&ICTXT, &SUB_B_ROWS, &SUB_B_COLS, SUB_B.data(), &LDA, &RSRC,
&CSRC);
}
if ((MYROW == 0) && (MYCOL == 0)) {
std::cout << "Matrix X:\n";
for (int i = 0; i < M; i++) {
if (i < 32)
std::cout << B[i] << ' ';
}
std::cout << std::endl;
std::cout << "Matrix AX:\n";
for (int i = 0; i < M; i++) {
double ans = 0.0;
if (i == 32)
break;
for (int j = 0; j < N; j++) {
ans += A[i * N + j] * B[j];
}
std::cout << ans << ' ';
}
std::cout << std::endl;
}
// Release the process grid
blacs_gridexit_(&ICTXT);
int blacs_exitcode = 0;
blacs_exit_(&blacs_exitcode);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment