Last active
April 3, 2020 00:53
-
-
Save pervognsen/9a6a5d014beff4c23bdcac97bff4b3a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
transpose_ij (10000x5000): gmemops=0.23, min=0.436734, avg=0.455368, relerr=3.42% | |
transpose_ji (10000x5000): gmemops=0.28, min=0.356635, avg=0.363628, relerr=2.55% | |
transpose_ij_ij (10000x5000): gmemops=1.53, min=0.065207, avg=0.069465, relerr=6.07% | |
transpose_rec1 (10000x5000): gmemops=1.44, min=0.069258, avg=0.075378, relerr=8.96% | |
transpose_rec2 (10000x5000): gmemops=1.37, min=0.072819, avg=0.079644, relerr=8.77% | |
transpose_ij (100000x100): gmemops=1.70, min=0.011731, avg=0.014102, relerr=9.99% | |
transpose_ji (100000x100): gmemops=0.23, min=0.086909, avg=0.095706, relerr=3.37% | |
transpose_ij_ij (100000x100): gmemops=1.73, min=0.011543, avg=0.013170, relerr=6.96% | |
transpose_rec1 (100000x100): gmemops=0.76, min=0.026243, avg=0.028419, relerr=5.23% | |
transpose_rec2 (100000x100): gmemops=1.85, min=0.010837, avg=0.012862, relerr=9.02% | |
gmemops is a normalized performance measure: 2e-9*m*n/runtime, where 2*m*n memory ops is the minimum for mxn transpose. | |
I measured ICC's memcpy for 10000x5000 at 4.9 gmemops and a simple *dest++ = *src++ memcpy at 3.4 gmemops. | |
So these numbers are currently far off from the theoretical speed of light for data movement. | |
*/ | |
void transpose_ij(Matrix A, Matrix B) { | |
int m = A.m; | |
int n = A.n; | |
for (int i = 0; i < m; i++) { | |
for (int j = 0; j < n; j++) { | |
A(i, j) = B(j, i); | |
} | |
} | |
} | |
void transpose_ji(Matrix A, Matrix B) { | |
int m = A.m; | |
int n = A.n; | |
for (int j = 0; j < n; j++) { | |
for (int i = 0; i < m; i++) { | |
A(i, j) = B(j, i); | |
} | |
} | |
} | |
void transpose_ij_ij(Matrix A, Matrix B) { | |
int m = A.m; | |
int n = A.n; | |
int mb = 128; | |
int nb = 128; | |
for (int i = 0; i < m; i += mb) { | |
for (int j = 0; j < n; j += nb) { | |
Matrix Aij = A.block(i, j, mb, nb); | |
Matrix Bji = B.block(j, i, nb, mb); | |
transpose_ij(Aij, Bji); | |
} | |
} | |
} | |
void transpose_rec1(Matrix A, Matrix B) { | |
int m = A.m; | |
int n = A.n; | |
if (m < 16 || n < 16) { | |
transpose_ij(A, B); | |
return; | |
} | |
int i = m / 2; | |
int j = n / 2; | |
Matrix A00 = A.block(0, 0, i, j); | |
Matrix B00 = B.block(0, 0, j, i); | |
transpose_rec1(A00, B00); | |
Matrix A01 = A.block(0, j, i, n-j); | |
Matrix B10 = B.block(j, 0, n-j, i); | |
transpose_rec1(A01, B10); | |
Matrix A10 = A.block(i, 0, m-i, j); | |
Matrix B01 = B.block(0, i, j, m-i); | |
transpose_rec1(A10, B01); | |
Matrix A11 = A.block(i, j, m-i, n-j); | |
Matrix B11 = B.block(j, i, n-j, m-i); | |
transpose_rec1(A11, B11); | |
} | |
void transpose_rec2(Matrix A, Matrix B) { | |
int m = A.m; | |
int n = A.n; | |
if (m < 16 || n < 16) { | |
transpose_ij(A, B); | |
return; | |
} | |
if (m >= n) { | |
int i = m / 2; | |
Matrix A0 = A.block(0, 0, i, n); | |
Matrix B0 = B.block(0, 0, n, i); | |
transpose_rec2(A0, B0); | |
Matrix A1 = A.block(i, 0, m-i, n); | |
Matrix B1 = B.block(0, i, n, m-i); | |
transpose_rec2(A1, B1); | |
} else { | |
int j = n / 2; | |
Matrix A0 = A.block(0, 0, m, j); | |
Matrix B0 = B.block(0, 0, j, m); | |
transpose_rec2(A0, B0); | |
Matrix A1 = A.block(0, j, m, n-j); | |
Matrix B1 = B.block(j, 0, n-j, m); | |
transpose_rec2(A1, B1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment