Skip to content

Instantly share code, notes, and snippets.

@catree
Last active September 30, 2019 11:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save catree/33942c1b4dd5ed2581e29de147030a1a to your computer and use it in GitHub Desktop.
Save catree/33942c1b4dd5ed2581e29de147030a1a to your computer and use it in GitHub Desktop.
Cache-friendly matrix transpose
#define CATCH_CONFIG_ENABLE_BENCHMARKING
#define CATCH_CONFIG_RUNNER
#include <cmath>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "catch.hpp"
namespace {
/// Absolute-tolerance comparison of two doubles.
///
/// @param x   first value
/// @param y   second value
/// @param tol absolute tolerance
/// @return true when |x - y| is strictly smaller than tol
bool equal(double x, double y, double tol)
{
    return std::fabs(x - y) < tol;
}

/// Minimal dense matrix of doubles, stored row-major in a single std::vector
/// (element (row, col) lives at index row * m_cols + col).
///
/// The class exists to compare several out-of-place transpose strategies that
/// differ only in the order in which they walk the source and destination
/// buffers. All strategies return the same result.
class MatrixXd {
public:
    /// Creates an empty 0 x 0 matrix.
    MatrixXd() :
        m_data(), m_rows(0), m_cols(0) {}

    /// Creates a row x col matrix with every element value-initialised to 0.0.
    MatrixXd(int row, int col) :
        m_data(row*col), m_rows(row), m_cols(col) {}

    /// Mutable element access. No bounds checking is performed.
    double& operator() (int row, int col)
    {
        return m_data[row*m_cols + col];
    }

    /// Read-only element access. No bounds checking is performed.
    double operator() (int row, int col) const
    {
        return m_data[row*m_cols + col];
    }

    /// Naive transpose that walks the *source* in storage order:
    /// reads are contiguous, writes jump by m_rows elements.
    MatrixXd transposeSrc() const
    {
        MatrixXd At(m_cols, m_rows);
        for (int i = 0; i < m_rows; i++) {
            for (int j = 0; j < m_cols; j++) {
                At(j, i) = (*this)(i, j);
            }
        }
        return At;
    }

    /// Naive transpose that walks the *destination* in storage order:
    /// writes are contiguous, reads jump by m_cols elements.
    MatrixXd transposeDst() const
    {
        MatrixXd At(m_cols, m_rows);
        for (int i = 0; i < m_cols; i++) {
            for (int j = 0; j < m_rows; j++) {
                At(i, j) = (*this)(j, i);
            }
        }
        return At;
    }

    /// Tiled transpose: the matrix is processed in tileSize x tileSize squares,
    /// followed by the leftover columns of each row band and finally the
    /// leftover rows.
    ///
    /// @param tileSize edge length of a tile; values below 1 are treated as 1
    ///                 (previously 0 never terminated and negative values
    ///                 indexed out of range)
    MatrixXd transposeTiling(int tileSize = 16) const
    {
        const int tile = sanitizeTileSize(tileSize);
        MatrixXd At(m_cols, m_rows);

        int i = 0;
        // Row bands that are a full tile high.
        for (; i <= m_rows - tile; i += tile) {
            int j = 0;
            // Full tile x tile squares inside the band.
            for (; j <= m_cols - tile; j += tile) {
                for (int k = i; k < i + tile; k++) {
                    for (int l = j; l < j + tile; l++) {
                        At(l, k) = (*this)(k, l);
                    }
                }
            }
            // Columns of the band that do not fill a whole tile.
            for (int k = i; k < i + tile; k++) {
                for (int l = j; l < m_cols; l++) {
                    At(l, k) = (*this)(k, l);
                }
            }
        }
        // Rows that do not fill a whole band.
        for (; i < m_rows; i++) {
            for (int j = 0; j < m_cols; j++) {
                At(j, i) = (*this)(i, j);
            }
        }
        return At;
    }

    /// Row-band transpose: the source is cut into bands of tileSize rows; for
    /// every column, the band's elements are copied to consecutive positions
    /// of the destination row.
    ///
    /// @param tileSize number of source rows per band; values below 1 are
    ///                 treated as 1 (previously 0 never terminated)
    MatrixXd transposeTilingSO(int tileSize = 16) const
    {
        const int tile = sanitizeTileSize(tileSize);
        MatrixXd out(m_cols, m_rows);
        for (int i = 0; i < m_rows; i += tile) {
            for (int j = 0; j < m_cols; ++j) {
                for (int b = 0; b < tile && i + b < m_rows; ++b) {
                    out.m_data[j*m_rows + i + b] = m_data[(i + b)*m_cols + j];
                }
            }
        }
        return out;
    }

    /// Picks one of the strategies above from the matrix shape:
    ///  - tall matrix with at most 64 columns   -> transposeSrc()
    ///  - wide matrix with at most 64 rows      -> transposeDst()
    ///  - row count is a multiple of tileSize   -> transposeTilingSO(tileSize)
    ///  - anything else                         -> transposeTiling(tileSize)
    ///
    /// NOTE(review): the thresholds (factor 2, limit 64) look empirically
    /// chosen from the benchmark runs - confirm before reusing elsewhere.
    ///
    /// @param tileSize tile size forwarded to the tiled strategies (it used to
    ///                 be ignored in favour of their defaults); values below 1
    ///                 are treated as 1, which also avoids a modulo by zero
    MatrixXd transposeOptim(int tileSize = 16) const
    {
        const int tile = sanitizeTileSize(tileSize);
        if (m_rows > 2 * m_cols && m_cols <= 64) {
            return transposeSrc();
        }
        if (m_cols > 2 * m_rows && m_rows <= 64) {
            return transposeDst();
        }
        if (m_rows % tile == 0) {
            return transposeTilingSO(tile);
        }
        return transposeTiling(tile);
    }

    /// Element-wise comparison. Two matrices are equal when their dimensions
    /// match and every pair of elements differs by less than
    /// std::numeric_limits<double>::epsilon() in absolute value.
    bool operator==(const MatrixXd& b) const
    {
        if (b.m_rows != m_rows || b.m_cols != m_cols) {
            return false;
        }
        for (int i = 0; i < m_rows; i++) {
            for (int j = 0; j < m_cols; j++) {
                if (!equal((*this)(i, j), b(i, j), std::numeric_limits<double>::epsilon())) {
                    return false;
                }
            }
        }
        return true;
    }

    // Kept public: code outside the class reads the dimensions directly.
    std::vector<double> m_data; // row-major element storage
    int m_rows;                 // number of rows
    int m_cols;                 // number of columns

private:
    /// Maps non-positive tile sizes to 1 so the tiled loops always advance.
    static int sanitizeTileSize(int tileSize)
    {
        return tileSize < 1 ? 1 : tileSize;
    }
};
/// Builds an sz1 x sz2 test matrix in which element (r, c) holds
/// r * sz2 + c, i.e. every element stores its own row-major linear index.
MatrixXd generateMatrix(int sz1, int sz2) {
    MatrixXd result(sz1, sz2);
    const int rowCount = result.m_rows;
    const int colCount = result.m_cols;

    // Visiting the elements in row-major order makes the value a plain counter.
    int linearIndex = 0;
    for (int r = 0; r < rowCount; ++r) {
        for (int c = 0; c < colCount; ++c) {
            result(r, c) = linearIndex;
            ++linearIndex;
        }
    }
    return result;
}
/// Builds the sz2 x sz1 reference result for transposing generateMatrix(sz1, sz2):
/// element (r, c) holds c * sz2 + r, the value generateMatrix stores at (c, r).
MatrixXd generateMatrixTranspose(int sz1, int sz2) {
    MatrixXd expected(sz2, sz1);
    const int rowCount = expected.m_rows;
    const int colCount = expected.m_cols;

    // Walking column by column, the expected values are simply 0, 1, 2, ...
    int nextValue = 0;
    for (int c = 0; c < colCount; ++c) {
        for (int r = 0; r < rowCount; ++r) {
            expected(r, c) = nextValue;
            ++nextValue;
        }
    }
    return expected;
}
TEST_CASE("Benchmark matrix transpose", "[benchmark]") {
const std::vector<std::pair<int, int>> sizes = { {701, 1503}, {1791, 837}, {1201, 1201}, {1024, 1024}, {2000, 2000},
{10, 6}, {100, 6}, {500, 6}, {1000, 6}, {2000, 6},
{10, 64}, {100, 64}, {500, 64}, {1000, 64}, {2000, 64},
{6, 10}, {6, 100}, {6, 500}, {6, 1000}, {6, 2000},
{640, 1000}, {800, 640}, {640, 500}, {500, 640}, {640, 837}, {837, 640}
};
for (auto sz : sizes) {
MatrixXd M = generateMatrix(sz.first, sz.second);
MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
std::ostringstream oss;
oss << sz.first << "x" << sz.second;
oss << " - M.transposeSrc()";
BENCHMARK(oss.str().c_str()) {
MatrixXd Mt = M.transposeSrc();
REQUIRE(Mt == Mt_true);
return Mt;
};
oss.str("");
oss << sz.first << "x" << sz.second;
oss << " - M.transposeDst()";
BENCHMARK(oss.str().c_str()) {
MatrixXd Mt = M.transposeDst();
REQUIRE(Mt == Mt_true);
return Mt;
};
oss.str("");
oss << sz.first << "x" << sz.second;
oss << " - M.transposeTilingSO()";
BENCHMARK(oss.str().c_str()) {
MatrixXd Mt = M.transposeTilingSO();
REQUIRE(Mt == Mt_true);
return Mt;
};
oss.str("");
oss << sz.first << "x" << sz.second;
oss << " - M.transposeTiling()";
BENCHMARK(oss.str().c_str()) {
MatrixXd Mt = M.transposeTiling();
REQUIRE(Mt == Mt_true);
return Mt;
};
oss.str("");
oss << sz.first << "x" << sz.second;
oss << " - M.transposeOptim()";
BENCHMARK(oss.str().c_str()) {
MatrixXd Mt = M.transposeOptim();
REQUIRE(Mt == Mt_true);
return Mt;
};
}
//for (auto sz : sizes) {
// MatrixXd M = generateMatrix(sz.first, sz.second);
// MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
// std::ostringstream oss;
// oss << sz.first << "x" << sz.second;
// oss << " - M.transposeSrc()";
// BENCHMARK(oss.str().c_str()) {
// MatrixXd Mt = M.transposeSrc();
// REQUIRE(Mt == Mt_true);
// return Mt;
// };
//}
//for (auto sz : sizes) {
// MatrixXd M = generateMatrix(sz.first, sz.second);
// MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
// std::ostringstream oss;
// oss << sz.first << "x" << sz.second;
// oss << " - M.transposeDst()";
// BENCHMARK(oss.str().c_str()) {
// MatrixXd Mt = M.transposeDst();
// REQUIRE(Mt == Mt_true);
// return Mt;
// };
//}
//for (auto sz : sizes) {
// MatrixXd M = generateMatrix(sz.first, sz.second);
// MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
// std::ostringstream oss;
// oss << sz.first << "x" << sz.second;
// oss << " - M.transposeTilingSO()";
// BENCHMARK(oss.str().c_str()) {
// MatrixXd Mt = M.transposeTilingSO();
// REQUIRE(Mt == Mt_true);
// return Mt;
// };
//}
//for (auto sz : sizes) {
// MatrixXd M = generateMatrix(sz.first, sz.second);
// MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
// std::ostringstream oss;
// oss << sz.first << "x" << sz.second;
// oss << " - M.transposeTiling()";
// BENCHMARK(oss.str().c_str()) {
// MatrixXd Mt = M.transposeTiling();
// REQUIRE(Mt == Mt_true);
// return Mt;
// };
//}
//for (auto sz : sizes) {
// MatrixXd M = generateMatrix(sz.first, sz.second);
// MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);
// std::ostringstream oss;
// oss << sz.first << "x" << sz.second;
// oss << " - M.transposeOptim()";
// BENCHMARK(oss.str().c_str()) {
// MatrixXd Mt = M.transposeOptim();
// REQUIRE(Mt == Mt_true);
// return Mt;
// };
//}
}
}
/// Custom entry point (CATCH_CONFIG_RUNNER): configures a Catch session from
/// the command line and runs the registered test cases.
///
/// @return the command-line parsing status if it is non-zero, otherwise the
///         value returned by Catch::Session::run()
int main(int argc, char *argv[])
{
    Catch::Session session; // There must be exactly one instance

    // Let Catch (using Clara) parse the command line
    const int cliStatus = session.applyCommandLine(argc, argv);
    if (cliStatus != 0) {
        // Non-zero indicates a command line error: report it instead of
        // running the session with an unintended configuration.
        return cliStatus;
    }

    // numFailed is clamped to 255 as some unices only use the lower 8 bits.
    // This clamping has already been applied, so just return it here
    // You can also do any post run clean-up here
    const int numFailed = session.run();
    return numFailed;
}
benchmark name samples iterations estimated
mean low mean high mean
std dev low std dev high std dev
-------------------------------------------------------------------------------
701x1503 - M.transposeSrc() 100 1 1.33581 s
13.5217 ms 13.4686 ms 13.6241 ms
363.282 us 205.441 us 614.763 us
701x1503 - M.transposeDst() 100 1 1.46756 s
14.9272 ms 14.8749 ms 14.986 ms
282.704 us 246.542 us 330.304 us
701x1503 - M.transposeTilingSO() 100 1 1.20588 s
12.1736 ms 12.0986 ms 12.4693 ms
674.628 us 130.473 us 1.58296 ms
701x1503 - M.transposeTiling() 100 1 879.485 ms
8.80787 ms 8.7821 ms 8.8395 ms
145.482 us 120.471 us 177.312 us
701x1503 - M.transposeOptim() 100 1 875.261 ms
8.75884 ms 8.73439 ms 8.79353 ms
147.051 us 113.127 us 216.557 us
1791x837 - M.transposeSrc() 100 1 2.00429 s
20.1265 ms 20.0448 ms 20.3114 ms
593.232 us 224.222 us 1.04084 ms
1791x837 - M.transposeDst() 100 1 2.18822 s
21.9146 ms 21.8052 ms 22.0655 ms
648.663 us 507.725 us 852.9 us
1791x837 - M.transposeTilingSO() 100 1 1.7262 s
17.3702 ms 17.3125 ms 17.4566 ms
354.908 us 261.148 us 528.674 us
1791x837 - M.transposeTiling() 100 1 1.29939 s
12.8264 ms 12.7865 ms 12.8779 ms
231.128 us 185.655 us 303.622 us
1791x837 - M.transposeOptim() 100 1 1.27144 s
12.7916 ms 12.7492 ms 12.8483 ms
249.525 us 195.995 us 342.697 us
1201x1201 - M.transposeSrc() 100 1 1.8964 s
18.9402 ms 18.8951 ms 19.0037 ms
269.819 us 206.147 us 384.575 us
1201x1201 - M.transposeDst() 100 1 2.08132 s
20.8071 ms 20.7606 ms 20.8678 ms
269.753 us 217.186 us 350.209 us
1201x1201 - M.transposeTilingSO() 100 1 1.69206 s
16.4527 ms 16.3729 ms 16.7379 ms
686.675 us 194.436 us 1.57422 ms
1201x1201 - M.transposeTiling() 100 1 1.24112 s
12.382 ms 12.3463 ms 12.4389 ms
225.193 us 161.788 us 367.464 us
1201x1201 - M.transposeOptim() 100 1 1.25534 s
12.5325 ms 12.491 ms 12.5977 ms
262.328 us 189.402 us 448.495 us
1024x1024 - M.transposeSrc() 100 1 1.4795 s
14.5538 ms 14.4798 ms 14.6498 ms
426.128 us 343.01 us 538.435 us
1024x1024 - M.transposeDst() 100 1 1.78832 s
17.6619 ms 17.5781 ms 17.8555 ms
612.149 us 332.978 us 1.2434 ms
1024x1024 - M.transposeTilingSO() 100 1 827.986 ms
8.3198 ms 8.25111 ms 8.56124 ms
587.589 us 180.224 us 1.33455 ms
1024x1024 - M.transposeTiling() 100 1 1.0355 s
10.3063 ms 10.2595 ms 10.4512 ms
379.301 us 153.84 us 831.083 us
1024x1024 - M.transposeOptim() 100 1 831.858 ms
8.45701 ms 8.32702 ms 8.6856 ms
854.014 us 546.172 us 1.22753 ms
2000x2000 - M.transposeSrc() 100 1 5.76389 s
57.6496 ms 57.5226 ms 57.7993 ms
699.662 us 601.632 us 841.167 us
2000x2000 - M.transposeDst() 100 1 6.0115 s
60.5116 ms 60.264 ms 60.958 ms
1.65362 ms 1.09742 ms 2.96921 ms
2000x2000 - M.transposeTilingSO() 100 1 3.54605 s
35.3772 ms 35.3106 ms 35.4719 ms
402.03 us 308.713 us 620.57 us
2000x2000 - M.transposeTiling() 100 1 3.63679 s
36.0236 ms 35.7791 ms 36.4135 ms
1.5485 ms 1.09031 ms 2.21434 ms
2000x2000 - M.transposeOptim() 100 1 3.63542 s
36.1886 ms 35.9014 ms 36.6428 ms
1.81409 ms 1.29751 ms 2.72564 ms
10x6 - M.transposeSrc() 100 147 5.0274 ms
343 ns 336 ns 353 ns
42 ns 29 ns 64 ns
10x6 - M.transposeDst() 100 144 5.04 ms
347 ns 343 ns 355 ns
27 ns 18 ns 46 ns
10x6 - M.transposeTilingSO() 100 118 5.0268 ms
450 ns 443 ns 460 ns
44 ns 33 ns 57 ns
10x6 - M.transposeTiling() 100 145 5.0315 ms
345 ns 342 ns 350 ns
17 ns 10 ns 34 ns
10x6 - M.transposeOptim() 100 142 5.0268 ms
345 ns 341 ns 353 ns
30 ns 19 ns 48 ns
100x6 - M.transposeSrc() 100 26 5.1454 ms
1.896 us 1.867 us 1.945 us
188 ns 134 ns 277 ns
100x6 - M.transposeDst() 100 24 5.0472 ms
2.098 us 2.073 us 2.144 us
168 ns 110 ns 270 ns
100x6 - M.transposeTilingSO() 100 26 5.2208 ms
1.987 us 1.972 us 2.015 us
101 ns 64 ns 168 ns
100x6 - M.transposeTiling() 100 26 5.1038 ms
1.966 us 1.945 us 2.018 us
160 ns 53 ns 281 ns
100x6 - M.transposeOptim() 100 26 5.109 ms
1.923 us 1.901 us 1.957 us
136 ns 96 ns 201 ns
500x6 - M.transposeSrc() 100 6 5.6262 ms
9.285 us 9.112 us 9.558 us
1.089 us 768 ns 1.543 us
500x6 - M.transposeDst() 100 6 5.9478 ms
9.959 us 9.784 us 10.226 us
1.095 us 782 ns 1.468 us
500x6 - M.transposeTilingSO() 100 6 5.7468 ms
9.431 us 9.28 us 9.773 us
1.098 us 608 ns 2.182 us
500x6 - M.transposeTiling() 100 6 5.589 ms
9.401 us 9.228 us 9.713 us
1.147 us 762 ns 1.813 us
500x6 - M.transposeOptim() 100 6 5.5716 ms
9.204 us 9.061 us 9.437 us
914 ns 645 ns 1.384 us
1000x6 - M.transposeSrc() 100 3 5.5023 ms
17.883 us 17.613 us 18.513 us
1.979 us 1.061 us 3.911 us
1000x6 - M.transposeDst() 100 3 5.9508 ms
19.244 us 18.878 us 20.101 us
2.696 us 1.43 us 5.379 us
1000x6 - M.transposeTilingSO() 100 3 5.5887 ms
19.032 us 18.709 us 19.69 us
2.26 us 1.233 us 3.69 us
1000x6 - M.transposeTiling() 100 3 5.4729 ms
20.114 us 19.272 us 21.402 us
5.223 us 3.888 us 7.422 us
1000x6 - M.transposeOptim() 100 3 5.4549 ms
18.294 us 17.996 us 18.954 us
2.146 us 1.134 us 3.901 us
2000x6 - M.transposeSrc() 100 2 7.1956 ms
37.673 us 36.58 us 39.394 us
6.917 us 4.822 us 9.411 us
2000x6 - M.transposeDst() 100 2 7.9916 ms
39.985 us 39.756 us 40.489 us
1.654 us 761 ns 2.806 us
2000x6 - M.transposeTilingSO() 100 2 7.3978 ms
37.474 us 36.778 us 38.652 us
4.513 us 3.168 us 7.121 us
2000x6 - M.transposeTiling() 100 2 7.178 ms
36.411 us 35.865 us 37.924 us
4.276 us 1.951 us 9.129 us
2000x6 - M.transposeOptim() 100 2 7.277 ms
36.246 us 35.745 us 37.156 us
3.344 us 2.111 us 5.064 us
10x64 - M.transposeSrc() 100 24 5.0568 ms
2.138 us 2.12 us 2.173 us
126 ns 78 ns 196 ns
10x64 - M.transposeDst() 100 25 5.2225 ms
2.174 us 2.129 us 2.247 us
287 ns 207 ns 435 ns
10x64 - M.transposeTilingSO() 100 24 5.1744 ms
2.096 us 2.063 us 2.147 us
204 ns 147 ns 295 ns
10x64 - M.transposeTiling() 100 24 5.04 ms
2.08 us 2.057 us 2.118 us
146 ns 97 ns 217 ns
10x64 - M.transposeOptim() 100 24 5.2056 ms
2.117 us 2.113 us 2.127 us
30 ns 14 ns 63 ns
100x64 - M.transposeSrc() 100 3 6.2262 ms
20.21 us 20.006 us 20.918 us
1.709 us 521 ns 3.853 us
100x64 - M.transposeDst() 100 3 6.7932 ms
22.555 us 22.212 us 23.831 us
3.008 us 715 ns 6.97 us
100x64 - M.transposeTilingSO() 100 3 6.1863 ms
20.533 us 20.274 us 21.039 us
1.772 us 1.03 us 2.847 us
100x64 - M.transposeTiling() 100 3 5.7456 ms
19.351 us 19.029 us 20.018 us
2.272 us 1.156 us 3.812 us
100x64 - M.transposeOptim() 100 3 5.8194 ms
19.032 us 18.765 us 19.671 us
1.983 us 1.034 us 4.028 us
500x64 - M.transposeSrc() 100 1 12.0583 ms
122.591 us 119.687 us 128.673 us
20.488 us 10.986 us 33.249 us
500x64 - M.transposeDst() 100 1 11.7043 ms
120.092 us 116.98 us 125.237 us
20.019 us 13.892 us 29.15 us
500x64 - M.transposeTilingSO() 100 1 10.5694 ms
103.439 us 102.182 us 105.384 us
7.837 us 5.641 us 10.791 us
500x64 - M.transposeTiling() 100 1 10.6368 ms
106.94 us 106.374 us 108.516 us
4.39 us 1.848 us 9.122 us
500x64 - M.transposeOptim() 100 1 11.805 ms
117.716 us 115.789 us 121.624 us
13.424 us 8.167 us 24.381 us
1000x64 - M.transposeSrc() 100 1 23.7876 ms
209.335 us 206.118 us 216.338 us
23.03 us 12.992 us 42.105 us
1000x64 - M.transposeDst() 100 1 25.8148 ms
262.432 us 259.313 us 270.463 us
23.63 us 11.117 us 47.815 us
1000x64 - M.transposeTilingSO() 100 1 20.7829 ms
211.797 us 209.307 us 216.176 us
16.435 us 10.925 us 25.933 us
1000x64 - M.transposeTiling() 100 1 19.5189 ms
193.527 us 190.805 us 200.469 us
20.697 us 9.885 us 42.525 us
1000x64 - M.transposeOptim() 100 1 20.3983 ms
203.244 us 201.917 us 206.257 us
9.698 us 5.094 us 19.272 us
2000x64 - M.transposeSrc() 100 1 41.493 ms
417.131 us 409.83 us 430.768 us
49.594 us 29.832 us 78.773 us
2000x64 - M.transposeDst() 100 1 53.0408 ms
609.291 us 589.343 us 638.272 us
121.407 us 90.333 us 164.889 us
2000x64 - M.transposeTilingSO() 100 1 47.7331 ms
438.563 us 434.518 us 444.161 us
24.072 us 18.853 us 32.667 us
2000x64 - M.transposeTiling() 100 1 39.2658 ms
645.482 us 608.959 us 690.002 us
204.912 us 170.205 us 254.469 us
2000x64 - M.transposeOptim() 100 1 43.9548 ms
413.109 us 410.035 us 417.682 us
18.913 us 13.653 us 25.959 us
6x10 - M.transposeSrc() 100 143 5.0479 ms
336 ns 333 ns 343 ns
22 ns 14 ns 34 ns
6x10 - M.transposeDst() 100 141 5.0478 ms
353 ns 350 ns 359 ns
19 ns 9 ns 33 ns
6x10 - M.transposeTilingSO() 100 144 5.04 ms
333 ns 330 ns 341 ns
25 ns 13 ns 51 ns
6x10 - M.transposeTiling() 100 145 5.0315 ms
374 ns 361 ns 393 ns
80 ns 63 ns 101 ns
6x10 - M.transposeOptim() 100 143 5.0336 ms
347 ns 344 ns 352 ns
21 ns 16 ns 31 ns
6x100 - M.transposeSrc() 100 26 5.174 ms
1.987 us 1.969 us 2.03 us
134 ns 66 ns 249 ns
6x100 - M.transposeDst() 100 25 5.095 ms
2.012 us 1.99 us 2.057 us
155 ns 94 ns 274 ns
6x100 - M.transposeTilingSO() 100 25 5.18 ms
2.01 us 1.978 us 2.071 us
218 ns 132 ns 354 ns
6x100 - M.transposeTiling() 100 26 5.1298 ms
1.995 us 1.974 us 2.035 us
141 ns 90 ns 251 ns
6x100 - M.transposeOptim() 100 25 5.1175 ms
2.032 us 2.004 us 2.083 us
188 ns 125 ns 304 ns
6x500 - M.transposeSrc() 100 6 5.6406 ms
9.49 us 9.374 us 9.709 us
785 ns 486 ns 1.264 us
6x500 - M.transposeDst() 100 5 5.324 ms
10.026 us 9.841 us 10.438 us
1.337 us 729 ns 2.446 us
6x500 - M.transposeTilingSO() 100 5 5.047 ms
9.932 us 9.839 us 10.186 us
717 ns 313 ns 1.472 us
6x500 - M.transposeTiling() 100 6 5.6376 ms
9.354 us 9.257 us 9.579 us
712 ns 344 ns 1.242 us
6x500 - M.transposeOptim() 100 6 5.9208 ms
9.864 us 9.773 us 10.077 us
669 ns 347 ns 1.238 us
6x1000 - M.transposeSrc() 100 3 6.9297 ms
33.555 us 33.057 us 34.385 us
3.202 us 2.206 us 5.248 us
6x1000 - M.transposeDst() 100 3 5.9598 ms
19.72 us 19.376 us 20.37 us
2.337 us 1.51 us 4.095 us
6x1000 - M.transposeTilingSO() 100 3 5.9967 ms
19.719 us 19.5 us 20.454 us
1.808 us 506 ns 3.985 us
6x1000 - M.transposeTiling() 100 3 6.1092 ms
20.358 us 20.093 us 20.826 us
1.757 us 1.125 us 2.672 us
6x1000 - M.transposeOptim() 100 3 5.8419 ms
19.59 us 19.282 us 20.231 us
2.182 us 1.054 us 3.667 us
6x2000 - M.transposeSrc() 100 2 8.1376 ms
41.004 us 40.358 us 42.238 us
4.417 us 2.737 us 7.489 us
6x2000 - M.transposeDst() 100 2 7.8642 ms
39.302 us 38.882 us 40.608 us
3.367 us 1.193 us 7.286 us
6x2000 - M.transposeTilingSO() 100 2 7.8902 ms
39.145 us 38.839 us 39.636 us
1.938 us 1.374 us 3.195 us
6x2000 - M.transposeTiling() 100 2 8.2668 ms
40.913 us 40.391 us 42.242 us
3.934 us 1.634 us 7.834 us
6x2000 - M.transposeOptim() 100 2 7.9046 ms
38.17 us 37.319 us 40.069 us
6.156 us 2.587 us 10.519 us
640x1000 - M.transposeSrc() 100 1 812.533 ms
8.17895 ms 8.14204 ms 8.23748 ms
232.495 us 159.016 us 357.593 us
640x1000 - M.transposeDst() 100 1 820.454 ms
8.21139 ms 8.18048 ms 8.24553 ms
166.413 us 146.152 us 192.761 us
640x1000 - M.transposeTilingSO() 100 1 551.423 ms
5.58898 ms 5.5692 ms 5.61138 ms
107.271 us 92.915 us 127.154 us
640x1000 - M.transposeTiling() 100 1 539.407 ms
5.56831 ms 5.4872 ms 5.71791 ms
543.574 us 340.069 us 929.592 us
640x1000 - M.transposeOptim() 100 1 565.028 ms
5.69277 ms 5.62122 ms 5.93922 ms
604.531 us 187.6 us 1.36717 ms
800x640 - M.transposeSrc() 100 1 541.873 ms
5.37478 ms 5.34312 ms 5.41696 ms
186.691 us 147.59 us 252.971 us
800x640 - M.transposeDst() 100 1 700.771 ms
6.99152 ms 6.96815 ms 7.01857 ms
128.46 us 106.449 us 161.095 us
800x640 - M.transposeTilingSO() 100 1 424.757 ms
4.42832 ms 4.36525 ms 4.61912 ms
507.931 us 193.717 us 1.10549 ms
800x640 - M.transposeTiling() 100 1 398.23 ms
4.04877 ms 4.01931 ms 4.09474 ms
184.445 us 133.435 us 303.767 us
800x640 - M.transposeOptim() 100 1 434.316 ms
4.36581 ms 4.31834 ms 4.46928 ms
340.103 us 154.871 us 586.834 us
640x500 - M.transposeSrc() 100 1 289.336 ms
3.32501 ms 3.22103 ms 3.45449 ms
588.368 us 505.994 us 739.506 us
640x500 - M.transposeDst() 100 1 646.412 ms
4.10666 ms 4.07737 ms 4.1477 ms
176.079 us 133.739 us 241.46 us
640x500 - M.transposeTilingSO() 100 1 313.07 ms
3.13385 ms 3.0495 ms 3.46781 ms
745.226 us 161.203 us 1.73092 ms
640x500 - M.transposeTiling() 100 1 279.971 ms
2.49484 ms 2.46777 ms 2.52341 ms
142.876 us 128.217 us 161.907 us
640x500 - M.transposeOptim() 100 1 292.134 ms
2.90779 ms 2.86965 ms 2.9782 ms
256.395 us 154.102 us 406.094 us
500x640 - M.transposeSrc() 100 1 371.786 ms
3.7282 ms 3.71351 ms 3.74981 ms
89.155 us 66.169 us 124.668 us
500x640 - M.transposeDst() 100 1 288.338 ms
2.92213 ms 2.90493 ms 2.94413 ms
98.936 us 81.054 us 128.952 us
500x640 - M.transposeTilingSO() 100 1 261.724 ms
2.60663 ms 2.59201 ms 2.62781 ms
88.934 us 67.512 us 144.408 us
500x640 - M.transposeTiling() 100 1 237.29 ms
2.38999 ms 2.37095 ms 2.41382 ms
108.116 us 90.8 us 133.621 us
500x640 - M.transposeOptim() 100 1 233.814 ms
2.39261 ms 2.37263 ms 2.43491 ms
142.436 us 82.648 us 279.288 us
640x837 - M.transposeSrc() 100 1 681.346 ms
6.7371 ms 6.71323 ms 6.77297 ms
146.595 us 108.376 us 220.458 us
640x837 - M.transposeDst() 100 1 720.69 ms
7.28952 ms 7.26036 ms 7.34868 ms
203.885 us 122.192 us 395.995 us
640x837 - M.transposeTilingSO() 100 1 595.576 ms
6.02544 ms 6.00303 ms 6.05725 ms
134.962 us 102.382 us 194.305 us
640x837 - M.transposeTiling() 100 1 439.721 ms
4.45634 ms 4.38057 ms 4.69761 ms
617.912 us 176.905 us 1.36078 ms
640x837 - M.transposeOptim() 100 1 594.039 ms
6.05717 ms 5.95265 ms 6.53803 ms
987.605 us 92.679 us 2.34693 ms
837x640 - M.transposeSrc() 100 1 615.768 ms
6.18926 ms 6.15459 ms 6.27144 ms
256.563 us 134.705 us 524.513 us
837x640 - M.transposeDst() 100 1 735.981 ms
7.38238 ms 7.35619 ms 7.41189 ms
141.714 us 120.568 us 166.691 us
837x640 - M.transposeTilingSO() 100 1 454.766 ms
4.50993 ms 4.48762 ms 4.55183 ms
151.788 us 97.711 us 281.057 us
837x640 - M.transposeTiling() 100 1 419.902 ms
4.21865 ms 4.18954 ms 4.26208 ms
179.769 us 129.087 us 267.846 us
837x640 - M.transposeOptim() 100 1 426.574 ms
4.26096 ms 4.17571 ms 4.59033 ms
788.384 us 102.962 us 1.86637 ms
===============================================================================
All tests passed (12802962 assertions in 1 test case)
benchmark name samples iterations estimated
mean low mean high mean
std dev low std dev high std dev
-------------------------------------------------------------------------------
701x1503 - M.transposeSrc() 100 1 357.183 ms
3.46899 ms 3.43505 ms 3.50642 ms
181.466 us 163.964 us 205.134 us
701x1503 - M.transposeDst() 100 1 353.707 ms
3.47051 ms 3.42374 ms 3.52123 ms
247.313 us 219.43 us 283.555 us
701x1503 - M.transposeTilingSO() 100 1 320.089 ms
3.22827 ms 3.17116 ms 3.29639 ms
317.913 us 278.554 us 369.349 us
701x1503 - M.transposeTiling() 100 1 259.815 ms
2.77319 ms 2.72398 ms 2.82446 ms
255.077 us 235.877 us 285.615 us
701x1503 - M.transposeOptim() 100 1 260.578 ms
2.71571 ms 2.66495 ms 2.77453 ms
278.662 us 243.735 us 332.941 us
1791x837 - M.transposeSrc() 100 1 568.204 ms
5.53622 ms 5.49697 ms 5.58348 ms
220.496 us 186.936 us 295.12 us
1791x837 - M.transposeDst() 100 1 811.527 ms
8.04237 ms 8.01087 ms 8.07725 ms
169.187 us 144.091 us 207.33 us
1791x837 - M.transposeTilingSO() 100 1 468.185 ms
4.52806 ms 4.48044 ms 4.57947 ms
251.963 us 228.824 us 283.423 us
1791x837 - M.transposeTiling() 100 1 412.859 ms
4.10997 ms 4.04017 ms 4.18759 ms
376.66 us 338.979 us 422.86 us
1791x837 - M.transposeOptim() 100 1 392.493 ms
4.13067 ms 4.07017 ms 4.19579 ms
321.46 us 292.933 us 355.637 us
1201x1201 - M.transposeSrc() 100 1 502.524 ms
4.92291 ms 4.88083 ms 4.97149 ms
229.526 us 198.987 us 272.446 us
1201x1201 - M.transposeDst() 100 1 554.466 ms
5.53983 ms 5.49058 ms 5.5966 ms
269.172 us 229.191 us 326.663 us
1201x1201 - M.transposeTilingSO() 100 1 433.002 ms
4.44824 ms 4.3968 ms 4.50724 ms
280.264 us 244.828 us 330.229 us
1201x1201 - M.transposeTiling() 100 1 399.054 ms
3.75262 ms 3.70161 ms 3.80912 ms
272.986 us 247.791 us 303.901 us
1201x1201 - M.transposeOptim() 100 1 397.614 ms
3.84316 ms 3.77968 ms 3.91483 ms
344.915 us 302.133 us 410.838 us
1024x1024 - M.transposeSrc() 100 1 541.995 ms
5.61084 ms 5.54408 ms 5.68312 ms
354.174 us 321.713 us 392.375 us
1024x1024 - M.transposeDst() 100 1 449.142 ms
4.29921 ms 4.2461 ms 4.36585 ms
303.533 us 250.368 us 382.601 us
1024x1024 - M.transposeTilingSO() 100 1 282.063 ms
2.74389 ms 2.67914 ms 2.82097 ms
359.898 us 312.862 us 418.617 us
1024x1024 - M.transposeTiling() 100 1 471.38 ms
4.78148 ms 4.73581 ms 4.8339 ms
248.556 us 215.324 us 301.822 us
1024x1024 - M.transposeOptim() 100 1 269.492 ms
2.72223 ms 2.67947 ms 2.77115 ms
232.913 us 203.185 us 266.345 us
2000x2000 - M.transposeSrc() 100 1 2.26015 s
22.3536 ms 22.2669 ms 22.4523 ms
473.398 us 414.672 us 554.137 us
2000x2000 - M.transposeDst() 100 1 2.38719 s
23.7494 ms 23.6728 ms 23.8316 ms
403.62 us 360.271 us 459.177 us
2000x2000 - M.transposeTilingSO() 100 1 1.17682 s
12.136 ms 12.068 ms 12.2013 ms
340.222 us 308.22 us 378.588 us
2000x2000 - M.transposeTiling() 100 1 1.18385 s
12.0904 ms 12.0238 ms 12.1592 ms
346.494 us 308.903 us 397.502 us
2000x2000 - M.transposeOptim() 100 1 1.20988 s
12.0665 ms 11.9959 ms 12.1398 ms
368.575 us 331.096 us 416.114 us
10x6 - M.transposeSrc() 100 128 2.048 ms
171 ns 171 ns 172 ns
2 ns 0 ns 4 ns
10x6 - M.transposeDst() 100 141 2.0445 ms
147 ns 147 ns 148 ns
1 ns 0 ns 3 ns
10x6 - M.transposeTilingSO() 100 129 2.0382 ms
160 ns 160 ns 161 ns
1 ns 0 ns 2 ns
10x6 - M.transposeTiling() 100 133 2.0482 ms
156 ns 156 ns 156 ns
0 ns 0 ns 1 ns
10x6 - M.transposeOptim() 100 133 2.0482 ms
150 ns 149 ns 150 ns
1 ns 0 ns 2 ns
100x6 - M.transposeSrc() 100 19 2.0501 ms
1.064 us 1.061 us 1.068 us
17 ns 12 ns 30 ns
100x6 - M.transposeDst() 100 25 2.0625 ms
799 ns 798 ns 803 ns
8 ns 0 ns 20 ns
100x6 - M.transposeTilingSO() 100 22 2.0636 ms
938 ns 936 ns 943 ns
13 ns 1 ns 28 ns
100x6 - M.transposeTiling() 100 22 2.1032 ms
959 ns 957 ns 965 ns
11 ns 1 ns 27 ns
100x6 - M.transposeOptim() 100 20 2.122 ms
1.073 us 1.072 us 1.078 us
10 ns 1 ns 25 ns
500x6 - M.transposeSrc() 100 5 2.497 ms
5.126 us 5.103 us 5.162 us
143 ns 100 ns 244 ns
500x6 - M.transposeDst() 100 6 2.2788 ms
3.861 us 3.76 us 4.183 us
817 ns 271 ns 1.791 us
500x6 - M.transposeTilingSO() 100 5 2.2025 ms
4.245 us 4.238 us 4.283 us
75 ns 4 ns 180 ns
500x6 - M.transposeTiling() 100 5 2.286 ms
4.393 us 4.385 us 4.433 us
80 ns 4 ns 191 ns
500x6 - M.transposeOptim() 100 4 2.0456 ms
4.93 us 4.917 us 4.967 us
101 ns 41 ns 221 ns
1000x6 - M.transposeSrc() 100 3 3.0453 ms
9.868 us 9.837 us 9.917 us
193 ns 127 ns 334 ns
1000x6 - M.transposeDst() 100 3 2.5881 ms
8.516 us 8.492 us 8.611 us
207 ns 14 ns 478 ns
1000x6 - M.transposeTilingSO() 100 3 2.5995 ms
8.731 us 8.705 us 8.822 us
204 ns 10 ns 463 ns
1000x6 - M.transposeTiling() 100 3 2.658 ms
9.543 us 9.152 us 10.529 us
2.785 us 290 ns 5.08 us
1000x6 - M.transposeOptim() 100 3 2.9787 ms
9.874 us 9.765 us 10.363 us
995 ns 132 ns 2.345 us
2000x6 - M.transposeSrc() 100 2 4.025 ms
20.42 us 20.027 us 21.552 us
3.104 us 1.248 us 6.61 us
2000x6 - M.transposeDst() 100 2 3.5252 ms
17.426 us 17.095 us 18.248 us
2.44 us 664 ns 4.363 us
2000x6 - M.transposeTilingSO() 100 2 3.5348 ms
17.287 us 17.213 us 17.438 us
517 ns 318 ns 990 ns
2000x6 - M.transposeTiling() 100 2 3.6406 ms
17.949 us 17.756 us 18.696 us
1.676 us 393 ns 3.856 us
2000x6 - M.transposeOptim() 100 2 4.04 ms
20.078 us 20.027 us 20.191 us
372 ns 201 ns 665 ns
10x64 - M.transposeSrc() 100 18 2.097 ms
1.164 us 1.161 us 1.173 us
24 ns 3 ns 46 ns
10x64 - M.transposeDst() 100 23 2.0907 ms
908 ns 906 ns 913 ns
13 ns 1 ns 26 ns
10x64 - M.transposeTilingSO() 100 20 2.08 ms
1.053 us 1.052 us 1.058 us
13 ns 2 ns 25 ns
10x64 - M.transposeTiling() 100 21 2.1105 ms
1.079 us 1.033 us 1.161 us
305 ns 196 ns 459 ns
10x64 - M.transposeOptim() 100 23 2.1229 ms
897 ns 895 ns 901 ns
12 ns 5 ns 27 ns
100x64 - M.transposeSrc() 100 2 2.3078 ms
11.234 us 11.206 us 11.314 us
214 ns 20 ns 451 ns
100x64 - M.transposeDst() 100 2 2.1264 ms
10.271 us 10.229 us 10.383 us
312 ns 31 ns 634 ns
100x64 - M.transposeTilingSO() 100 3 2.9781 ms
9.643 us 9.614 us 9.756 us
247 ns 15 ns 574 ns
100x64 - M.transposeTiling() 100 3 2.8785 ms
9.79 us 9.741 us 9.878 us
328 ns 218 ns 558 ns
100x64 - M.transposeOptim() 100 3 2.946 ms
9.86 us 9.826 us 9.948 us
246 ns 23 ns 480 ns
500x64 - M.transposeSrc() 100 1 5.8866 ms
59.45 us 59.302 us 60.007 us
1.275 us 319 ns 2.903 us
500x64 - M.transposeDst() 100 1 6.9596 ms
69.204 us 68.983 us 69.972 us
1.866 us 548 ns 4.22 us
500x64 - M.transposeTilingSO() 100 1 4.9472 ms
50.91 us 49.509 us 53.688 us
9.687 us 5.463 us 15.69 us
500x64 - M.transposeTiling() 100 1 4.8529 ms
47.505 us 47.249 us 48.545 us
2.271 us 378 ns 5.241 us
500x64 - M.transposeOptim() 100 1 5.9125 ms
58.801 us 58.094 us 60.208 us
4.889 us 2.854 us 8.003 us
1000x64 - M.transposeSrc() 100 1 11.0419 ms
111.456 us 110.932 us 112.431 us
3.532 us 2.271 us 6.113 us
1000x64 - M.transposeDst() 100 1 17.0416 ms
170.834 us 170.262 us 172.324 us
4.369 us 2.139 us 9.125 us
1000x64 - M.transposeTilingSO() 100 1 9.823 ms
101.293 us 100.45 us 103.571 us
6.502 us 2.678 us 13.594 us
1000x64 - M.transposeTiling() 100 1 9.0109 ms
92.09 us 91.258 us 94.558 us
6.581 us 1.879 us 14.07 us
1000x64 - M.transposeOptim() 100 1 11.0237 ms
111.293 us 109.605 us 114.179 us
11.05 us 7.378 us 16.659 us
2000x64 - M.transposeSrc() 100 1 22.8322 ms
227.92 us 224.329 us 233.785 us
23.126 us 16.069 us 32.818 us
2000x64 - M.transposeDst() 100 1 36.995 ms
363.675 us 359.523 us 369.659 us
25.172 us 19.141 us 34.099 us
2000x64 - M.transposeTilingSO() 100 1 20.4045 ms
204.861 us 203.054 us 209.214 us
13.514 us 5.392 us 26.409 us
2000x64 - M.transposeTiling() 100 1 18.8847 ms
189.95 us 188.249 us 193.99 us
12.54 us 5.765 us 24.877 us
2000x64 - M.transposeOptim() 100 1 22.7242 ms
229.34 us 227.362 us 232.385 us
12.285 us 9.268 us 19.122 us
6x10 - M.transposeSrc() 100 128 2.048 ms
173 ns 173 ns 175 ns
3 ns 0 ns 7 ns
6x10 - M.transposeDst() 100 138 2.0424 ms
152 ns 152 ns 152 ns
1 ns 0 ns 2 ns
6x10 - M.transposeTilingSO() 100 126 2.0538 ms
166 ns 165 ns 167 ns
3 ns 1 ns 7 ns
6x10 - M.transposeTiling() 100 128 2.0352 ms
160 ns 159 ns 160 ns
1 ns 0 ns 3 ns
6x10 - M.transposeOptim() 100 131 2.0436 ms
158 ns 154 ns 165 ns
28 ns 19 ns 38 ns
6x100 - M.transposeSrc() 100 19 2.0615 ms
1.101 us 1.096 us 1.105 us
23 ns 14 ns 34 ns
6x100 - M.transposeDst() 100 23 2.0861 ms
875 ns 874 ns 879 ns
9 ns 1 ns 22 ns
6x100 - M.transposeTilingSO() 100 20 2.074 ms
1.049 us 1.047 us 1.053 us
16 ns 10 ns 25 ns
6x100 - M.transposeTiling() 100 21 2.1042 ms
1.035 us 1.033 us 1.038 us
9 ns 5 ns 18 ns
6x100 - M.transposeOptim() 100 23 2.1091 ms
927 ns 926 ns 932 ns
12 ns 5 ns 27 ns
6x500 - M.transposeSrc() 100 5 2.5145 ms
4.836 us 4.817 us 4.868 us
123 ns 85 ns 197 ns
6x500 - M.transposeDst() 100 5 2.152 ms
4.175 us 4.165 us 4.204 us
71 ns 9 ns 156 ns
6x500 - M.transposeTilingSO() 100 5 2.5 ms
4.864 us 4.857 us 4.891 us
64 ns 5 ns 152 ns
6x500 - M.transposeTiling() 100 5 2.24 ms
4.533 us 4.462 us 4.777 us
607 ns 189 ns 1.375 us
6x500 - M.transposeOptim() 100 5 2.183 ms
4.377 us 4.362 us 4.404 us
98 ns 63 ns 184 ns
6x1000 - M.transposeSrc() 100 2 2.3416 ms
11.711 us 11.664 us 11.776 us
278 ns 218 ns 415 ns
6x1000 - M.transposeDst() 100 3 2.5656 ms
8.586 us 8.531 us 8.777 us
455 ns 90 ns 1.014 us
6x1000 - M.transposeTilingSO() 100 3 2.9394 ms
9.903 us 9.883 us 9.969 us
157 ns 8 ns 349 ns
6x1000 - M.transposeTiling() 100 2 2.137 ms
10.714 us 10.678 us 10.801 us
270 ns 132 ns 566 ns
6x1000 - M.transposeOptim() 100 3 2.6178 ms
8.576 us 8.557 us 8.642 us
161 ns 54 ns 362 ns
6x2000 - M.transposeSrc() 100 1 2.4143 ms
24.308 us 24.179 us 24.712 us
1.029 us 245 ns 2.235 us
6x2000 - M.transposeDst() 100 2 3.4362 ms
16.733 us 16.69 us 16.852 us
337 ns 121 ns 714 ns
6x2000 - M.transposeTilingSO() 100 2 3.9948 ms
19.536 us 19.35 us 20.299 us
1.652 us 270 ns 3.856 us
6x2000 - M.transposeTiling() 100 1 2.2519 ms
22.449 us 22.106 us 23.324 us
2.584 us 779 ns 4.813 us
6x2000 - M.transposeOptim() 100 2 3.5188 ms
17.554 us 17.371 us 18.236 us
1.586 us 415 ns 3.629 us
640x1000 - M.transposeSrc() 100 1 289.281 ms
2.8153 ms 2.78917 ms 2.84514 ms
143.091 us 126.887 us 159.577 us
640x1000 - M.transposeDst() 100 1 171.014 ms
1.60794 ms 1.57746 ms 1.64136 ms
162.392 us 146.745 us 194.924 us
640x1000 - M.transposeTilingSO() 100 1 152.985 ms
1.49469 ms 1.47102 ms 1.5226 ms
131.178 us 114.096 us 150.067 us
640x1000 - M.transposeTiling() 100 1 149.863 ms
1.52566 ms 1.49141 ms 1.56711 ms
191.479 us 164.621 us 226.526 us
640x1000 - M.transposeOptim() 100 1 148.478 ms
1.54903 ms 1.5167 ms 1.58977 ms
185.29 us 155.03 us 227.128 us
800x640 - M.transposeSrc() 100 1 169.205 ms
1.73036 ms 1.70934 ms 1.75401 ms
114.835 us 101.702 us 129.548 us
800x640 - M.transposeDst() 100 1 162.073 ms
1.66443 ms 1.61781 ms 1.71729 ms
252.499 us 223.028 us 284.325 us
800x640 - M.transposeTilingSO() 100 1 113.913 ms
1.19372 ms 1.16122 ms 1.23329 ms
181.937 us 155.571 us 220.84 us
800x640 - M.transposeTiling() 100 1 109.643 ms
1.13409 ms 1.10902 ms 1.16273 ms
137.122 us 121 us 155.28 us
800x640 - M.transposeOptim() 100 1 114.597 ms
1.171 ms 1.14468 ms 1.20163 ms
145.198 us 128.907 us 162.895 us
640x500 - M.transposeSrc() 100 1 132.556 ms
1.34859 ms 1.33202 ms 1.36889 ms
93.265 us 78.314 us 119.684 us
640x500 - M.transposeDst() 100 1 63.4138 ms
646.873 us 629.822 us 670.009 us
100.843 us 81.151 us 140.241 us
640x500 - M.transposeTilingSO() 100 1 63.1857 ms
663.314 us 642.978 us 686.554 us
110.759 us 98.833 us 122.304 us
640x500 - M.transposeTiling() 100 1 62.2165 ms
634.846 us 618.136 us 656.279 us
96.002 us 79.693 us 120.969 us
640x500 - M.transposeOptim() 100 1 64.32 ms
643.628 us 623.666 us 666.202 us
107.93 us 95.638 us 120.288 us
500x640 - M.transposeSrc() 100 1 78.2872 ms
766.578 us 755.671 us 780.743 us
62.735 us 51.324 us 81.669 us
500x640 - M.transposeDst() 100 1 83.5064 ms
796.774 us 783.67 us 813.958 us
76.212 us 62.608 us 100.141 us
500x640 - M.transposeTilingSO() 100 1 60.4457 ms
586.66 us 575.191 us 600.019 us
63.125 us 55.79 us 80.027 us
500x640 - M.transposeTiling() 100 1 60.3372 ms
587.566 us 580.262 us 597.417 us
42.86 us 34.976 us 62.777 us
500x640 - M.transposeOptim() 100 1 59.5159 ms
596.746 us 586.509 us 609.393 us
57.613 us 49.225 us 71.918 us
640x837 - M.transposeSrc() 100 1 236.482 ms
2.34727 ms 2.33144 ms 2.36639 ms
88.172 us 74.732 us 105.038 us
640x837 - M.transposeDst() 100 1 148.285 ms
1.39128 ms 1.35915 ms 1.42674 ms
172.051 us 155.703 us 189.89 us
640x837 - M.transposeTilingSO() 100 1 130.99 ms
1.26723 ms 1.24334 ms 1.29448 ms
130.094 us 115.504 us 155.898 us
640x837 - M.transposeTiling() 100 1 123.241 ms
1.15381 ms 1.13562 ms 1.17656 ms
103.489 us 86.369 us 125.177 us
640x837 - M.transposeOptim() 100 1 131.093 ms
1.2513 ms 1.22887 ms 1.27694 ms
122.607 us 108.589 us 138.794 us
837x640 - M.transposeSrc() 100 1 141.651 ms
1.36352 ms 1.34847 ms 1.38188 ms
84.671 us 72.238 us 99.24 us
837x640 - M.transposeDst() 100 1 175.583 ms
1.65168 ms 1.6168 ms 1.69022 ms
187.367 us 167.584 us 207.404 us
837x640 - M.transposeTilingSO() 100 1 119.761 ms
1.14924 ms 1.12977 ms 1.1717 ms
106.828 us 94.149 us 121.626 us
837x640 - M.transposeTiling() 100 1 111.963 ms
1.06767 ms 1.05048 ms 1.08961 ms
98.186 us 81.284 us 122.473 us
837x640 - M.transposeOptim() 100 1 111.887 ms
1.06025 ms 1.04424 ms 1.07948 ms
89.609 us 77.382 us 103.938 us
===============================================================================
All tests passed (26448914 assertions in 1 test case)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment