Created
May 19, 2020 12:14
-
-
Save hokru/3f16adf5505f49df95ceee024f75b200 to your computer and use it in GitHub Desktop.
test program for gg's transpose function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <random>
#include <vector>
// Compiler-specific helpers for aligned memory and optimization hints.
//
// ALIGNED_MALLOC / ALIGNED_FREE  forward to _mm_malloc / _mm_free; note that
//                                the macro takes the alignment first while
//                                _mm_malloc takes the size first.
// ASSUME_ALIGNED(ptr, width)     tells an Intel compiler that ptr is
//                                width-byte aligned; expands to a no-op on
//                                other compilers, which lack __assume_aligned.
// PRAGMA_VECTORIZE               Intel "vector" loop pragma; empty elsewhere.
// PRAGMA_RESTRICT                placeholder for a restrict qualifier on
//                                pointer parameters; currently expands to
//                                nothing.
#define ALIGNED_MALLOC(alignment, size) _mm_malloc(size, alignment)
#define ALIGNED_FREE(ptr) _mm_free(ptr)
#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
#define ASSUME_ALIGNED(ptr, width) __assume_aligned(ptr, width)
#define PRAGMA_VECTORIZE _Pragma("vector")
#else
#define ASSUME_ALIGNED(ptr, width) ((void)0)
#define PRAGMA_VECTORIZE
#endif
#define PRAGMA_RESTRICT
void gg_naive_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, | |
double* PRAGMA_RESTRICT output) { | |
ASSUME_ALIGNED(input, 64); | |
for (unsigned long i = 0; i < n; i++) { | |
for (unsigned long j = 0; j < m; j++) { | |
output[j * n + i] = input[i * m + j]; | |
} | |
} | |
} | |
void gg_fast_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, | |
double* PRAGMA_RESTRICT output) { | |
double tmp[64] __attribute__((aligned(64))); | |
ASSUME_ALIGNED(input, 64); | |
// Sizing | |
unsigned long nblocks = n / 8; | |
nblocks += (n % 8) ? 1 : 0; | |
unsigned long mblocks = m / 8; | |
mblocks += (m % 8) ? 1 : 0; | |
// Outer blocks | |
for (unsigned long nb = 0; nb < nblocks; nb++) { | |
const unsigned long nstart = nb * 8; | |
unsigned long nremain = ((nstart + 8) > n) ? (n - nstart) : 8; | |
for (unsigned long mb = 0; mb < mblocks; mb++) { | |
const unsigned long mstart = mb * 8; | |
unsigned long mremain = ((mstart + 8) > m) ? (m - mstart) : 8; | |
// Copy data to inner block | |
for (unsigned long l = 0; l < nremain; l++) { | |
const unsigned long start = (nstart + l) * m + mstart; | |
for (unsigned long k = 0; k < mremain; k++) { | |
tmp[k * 8 + l] = input[start + k]; | |
} | |
} | |
// Copy data to inner block | |
for (unsigned long k = 0; k < mremain; k++) { | |
const unsigned long start = (mstart + k) * n + nstart; | |
for (unsigned long l = 0; l < nremain; l++) { | |
output[start + l] = tmp[k * 8 + l]; | |
} | |
} | |
} | |
} | |
} | |
// Returns a pseudo-random double in the closed interval [fMin, fMax].
//
// The value is derived from a single rand() call scaled by RAND_MAX, so the
// sequence is controlled by whatever seed was given to srand().
double fRand(double fMin, double fMax) {
    const double f = static_cast<double>(std::rand()) / RAND_MAX;
    return fMin + f * (fMax - fMin);
}
void main() { | |
std::srand(std::time(nullptr)); | |
size_t n; | |
size_t m; | |
size_t nloops; | |
bool do_naive; | |
printf("specify: n, m, nloops, do_naive \n"); | |
std::cin >> n; | |
std::cin >> m; | |
std::cin >> nloops; | |
std::cin >> do_naive; | |
printf("Matrix size %d x %xd", n, m); | |
double *v = new double[n * m]; | |
double *t = new double[n * m]; | |
for (auto i = 0; i < n; i++) { | |
for (auto j = 0; j < m; j++) { | |
auto idx = j * n + i; | |
v[idx] = fRand(0, 1); | |
// printf("%lf \n",v[idx]); | |
} | |
} | |
if (do_naive) { | |
for (auto k = 0; k < nloops; k++) { | |
gg_naive_transpose(n, m, v, t); | |
} | |
} else { | |
for (auto k = 0; k < nloops; k++) { | |
gg_fast_transpose(n, m, v, t); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment