Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Last active April 3, 2020 00:53
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pervognsen/9a6a5d014beff4c23bdcac97bff4b3a7 to your computer and use it in GitHub Desktop.
Save pervognsen/9a6a5d014beff4c23bdcac97bff4b3a7 to your computer and use it in GitHub Desktop.
/*
transpose_ij (10000x5000): gmemops=0.23, min=0.436734, avg=0.455368, relerr=3.42%
transpose_ji (10000x5000): gmemops=0.28, min=0.356635, avg=0.363628, relerr=2.55%
transpose_ij_ij (10000x5000): gmemops=1.53, min=0.065207, avg=0.069465, relerr=6.07%
transpose_rec1 (10000x5000): gmemops=1.44, min=0.069258, avg=0.075378, relerr=8.96%
transpose_rec2 (10000x5000): gmemops=1.37, min=0.072819, avg=0.079644, relerr=8.77%
transpose_ij (100000x100): gmemops=1.70, min=0.011731, avg=0.014102, relerr=9.99%
transpose_ji (100000x100): gmemops=0.23, min=0.086909, avg=0.095706, relerr=3.37%
transpose_ij_ij (100000x100): gmemops=1.73, min=0.011543, avg=0.013170, relerr=6.96%
transpose_rec1 (100000x100): gmemops=0.76, min=0.026243, avg=0.028419, relerr=5.23%
transpose_rec2 (100000x100): gmemops=1.85, min=0.010837, avg=0.012862, relerr=9.02%
gmemops is a normalized performance measure: 2e-9*m*n/runtime, where 2*m*n memory ops is the minimum for mxn transpose.
I measured ICC's memcpy for 10000x5000 at 4.9 gmemops and a simple *dest++ = *src++ memcpy at 3.4 gmemops.
So these numbers are currently far off from the theoretical speed of light for data movement.
*/
void transpose_ij(Matrix A, Matrix B) {
int m = A.m;
int n = A.n;
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
A(i, j) = B(j, i);
}
}
}
void transpose_ji(Matrix A, Matrix B) {
int m = A.m;
int n = A.n;
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
A(i, j) = B(j, i);
}
}
}
void transpose_ij_ij(Matrix A, Matrix B) {
int m = A.m;
int n = A.n;
int mb = 128;
int nb = 128;
for (int i = 0; i < m; i += mb) {
for (int j = 0; j < n; j += nb) {
Matrix Aij = A.block(i, j, mb, nb);
Matrix Bji = B.block(j, i, nb, mb);
transpose_ij(Aij, Bji);
}
}
}
void transpose_rec1(Matrix A, Matrix B) {
int m = A.m;
int n = A.n;
if (m < 16 || n < 16) {
transpose_ij(A, B);
return;
}
int i = m / 2;
int j = n / 2;
Matrix A00 = A.block(0, 0, i, j);
Matrix B00 = B.block(0, 0, j, i);
transpose_rec1(A00, B00);
Matrix A01 = A.block(0, j, i, n-j);
Matrix B10 = B.block(j, 0, n-j, i);
transpose_rec1(A01, B10);
Matrix A10 = A.block(i, 0, m-i, j);
Matrix B01 = B.block(0, i, j, m-i);
transpose_rec1(A10, B01);
Matrix A11 = A.block(i, j, m-i, n-j);
Matrix B11 = B.block(j, i, n-j, m-i);
transpose_rec1(A11, B11);
}
void transpose_rec2(Matrix A, Matrix B) {
int m = A.m;
int n = A.n;
if (m < 16 || n < 16) {
transpose_ij(A, B);
return;
}
if (m >= n) {
int i = m / 2;
Matrix A0 = A.block(0, 0, i, n);
Matrix B0 = B.block(0, 0, n, i);
transpose_rec2(A0, B0);
Matrix A1 = A.block(i, 0, m-i, n);
Matrix B1 = B.block(0, i, n, m-i);
transpose_rec2(A1, B1);
} else {
int j = n / 2;
Matrix A0 = A.block(0, 0, m, j);
Matrix B0 = B.block(0, 0, j, m);
transpose_rec2(A0, B0);
Matrix A1 = A.block(0, j, m, n-j);
Matrix B1 = B.block(j, 0, n-j, m);
transpose_rec2(A1, B1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment