Skip to content

Instantly share code, notes, and snippets.

@hokru
Created May 19, 2020 12:14
Show Gist options
  • Save hokru/3f16adf5505f49df95ceee024f75b200 to your computer and use it in GitHub Desktop.
Save hokru/3f16adf5505f49df95ceee024f75b200 to your computer and use it in GitHub Desktop.
test program for gg's transpose function
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <random>
#include <vector>
// Compiler shims used by the gg transpose kernels.
//
//   ALIGNED_MALLOC(alignment, size) / ALIGNED_FREE(ptr)
//       Aligned heap allocation through _mm_malloc / _mm_free. Note the swapped
//       argument order: _mm_malloc takes (size, alignment). Code that expands
//       these macros must include a header that declares those two functions.
//   ASSUME_ALIGNED(ptr, width)
//       Statement telling the optimizer that `ptr` is `width`-byte aligned.
//   PRAGMA_VECTORIZE
//       Vectorization hint to be placed directly in front of a loop.
//   PRAGMA_RESTRICT
//       Qualifier slot for pointer parameters; intentionally expands to nothing.
//
// __assume_aligned and `#pragma vector` are Intel classic compiler extensions.
// Every other compiler gets no-op definitions so that code using the macros
// still compiles, merely without the hints.
#define ALIGNED_MALLOC(alignment, size) _mm_malloc(size, alignment)
#define ALIGNED_FREE(ptr) _mm_free(ptr)
#if defined(__INTEL_COMPILER) || defined(__ICC)
#define ASSUME_ALIGNED(ptr, width) __assume_aligned(ptr, width)
#define PRAGMA_VECTORIZE _Pragma("vector")
#else
#define ASSUME_ALIGNED(ptr, width) ((void)0)
#define PRAGMA_VECTORIZE
#endif
#define PRAGMA_RESTRICT
// Reference out-of-place transpose without any blocking.
//
// `input` is read as a row-major n x m matrix (element (i, j) at i * m + j) and
// `output` receives the row-major m x n transpose (element (j, i) at j * n + i).
// Both buffers must hold n * m doubles. The input pointer is declared 64-byte
// aligned to the compiler through ASSUME_ALIGNED, so callers are expected to
// pass storage with that alignment.
void gg_naive_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input,
                        double* PRAGMA_RESTRICT output) {
    ASSUME_ALIGNED(input, 64);

    for (unsigned long row = 0; row < n; ++row) {
        const unsigned long row_offset = row * m;
        for (unsigned long col = 0; col < m; ++col) {
            const unsigned long src_index = row_offset + col;
            const unsigned long dst_index = col * n + row;
            output[dst_index] = input[src_index];
        }
    }
}
// Blocked out-of-place transpose that moves the matrix through an 8 x 8 scratch tile.
//
// `input` is read as a row-major n x m matrix (element (i, j) at i * m + j) and
// `output` receives the row-major m x n transpose (element (j, i) at j * n + i).
// Both buffers must hold n * m doubles. The input pointer is declared 64-byte
// aligned to the compiler through ASSUME_ALIGNED, so callers are expected to
// pass storage with that alignment. Dimensions that are not multiples of 8 are
// handled with partial tiles along the trailing edges.
void gg_fast_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input,
                       double* PRAGMA_RESTRICT output) {
    constexpr unsigned long kTile = 8;

    // Scratch tile holding one block already transposed: tmp[k * kTile + l]
    // is the input element at row nstart + l, column mstart + k.
    alignas(64) double tmp[kTile * kTile];
    ASSUME_ALIGNED(input, 64);

    // Tile counts per dimension, rounded up so a ragged edge gets its own tile.
    const unsigned long nblocks = n / kTile + ((n % kTile) ? 1 : 0);
    const unsigned long mblocks = m / kTile + ((m % kTile) ? 1 : 0);

    for (unsigned long nb = 0; nb < nblocks; nb++) {
        const unsigned long nstart = nb * kTile;
        // nstart < n holds for every tile, so the subtraction cannot wrap.
        const unsigned long nremain = (n - nstart < kTile) ? (n - nstart) : kTile;

        for (unsigned long mb = 0; mb < mblocks; mb++) {
            const unsigned long mstart = mb * kTile;
            const unsigned long mremain = (m - mstart < kTile) ? (m - mstart) : kTile;

            // Gather: read the input block row by row and store it transposed in tmp.
            for (unsigned long l = 0; l < nremain; l++) {
                const unsigned long start = (nstart + l) * m + mstart;
                for (unsigned long k = 0; k < mremain; k++) {
                    tmp[k * kTile + l] = input[start + k];
                }
            }

            // Scatter: write each row of the transposed tile to its place in output.
            for (unsigned long k = 0; k < mremain; k++) {
                const unsigned long start = (mstart + k) * n + nstart;
                for (unsigned long l = 0; l < nremain; l++) {
                    output[start + l] = tmp[k * kTile + l];
                }
            }
        }
    }
}
// Returns a pseudo-random double between fMin and fMax, obtained by scaling
// one rand() draw (divided by RAND_MAX) onto that interval.
double fRand(double fMin, double fMax) {
    const double unit = static_cast<double>(rand()) / RAND_MAX;
    const double span = fMax - fMin;
    return fMin + unit * span;
}
// Alignment (in bytes) that the gg transpose kernels declare for their input pointer.
static const std::size_t kBufferAlignment = 64;
// Extra doubles allocated per buffer so an aligned start always fits inside it.
static const std::size_t kBufferPadding = kBufferAlignment / sizeof(double);

// Returns the first kBufferAlignment-aligned address inside `storage` that still
// leaves room for `count` doubles, or nullptr when `storage` is too small.
static double* aligned_begin(std::vector<double>& storage, std::size_t count) {
    void* cursor = storage.data();
    std::size_t space = storage.size() * sizeof(double);
    return static_cast<double*>(std::align(kBufferAlignment, count * sizeof(double), cursor, space));
}

// Benchmark driver: reads `n m nloops do_naive` from stdin, fills an n x m
// matrix with random values in [0, 1] and transposes it `nloops` times with
// either gg_naive_transpose (do_naive = 1) or gg_fast_transpose (do_naive = 0).
// Returns EXIT_FAILURE when the input is malformed or the matrix is too large.
int main() {
    std::srand(static_cast<unsigned>(std::time(nullptr)));

    std::size_t n = 0;
    std::size_t m = 0;
    std::size_t nloops = 0;
    bool do_naive = false;

    std::printf("specify: n, m, nloops, do_naive \n");
    if (!(std::cin >> n >> m >> nloops >> do_naive)) {
        std::fprintf(stderr, "error: expected four values: n m nloops do_naive(0|1)\n");
        return EXIT_FAILURE;
    }
    std::printf("Matrix size %zu x %zu\n", n, m);

    // Reject sizes whose element count (plus padding) or byte size would wrap
    // around; a wrapped count would make the kernels run past the buffers.
    const std::size_t max_count =
        std::numeric_limits<std::size_t>::max() / sizeof(double) - kBufferPadding;
    if (m != 0 && n > max_count / m) {
        std::fprintf(stderr, "error: matrix of %zu x %zu elements is too large\n", n, m);
        return EXIT_FAILURE;
    }
    const std::size_t count = n * m;

    // Plain new[] does not promise 64-byte alignment, which the kernels assume
    // for their input. Over-allocate in vectors (released automatically) and
    // use the first suitably aligned element of each as the working buffer.
    std::vector<double> v_storage(count + kBufferPadding);
    std::vector<double> t_storage(count + kBufferPadding);
    double* v = aligned_begin(v_storage, count);
    double* t = aligned_begin(t_storage, count);
    if (v == nullptr || t == nullptr) {
        std::fprintf(stderr, "error: could not obtain aligned buffers\n");
        return EXIT_FAILURE;
    }

    std::generate_n(v, count, [] { return fRand(0, 1); });

    if (do_naive) {
        for (std::size_t k = 0; k < nloops; ++k) {
            gg_naive_transpose(n, m, v, t);
        }
    } else {
        for (std::size_t k = 0; k < nloops; ++k) {
            gg_fast_transpose(n, m, v, t);
        }
    }
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment