-
-
Save catree/33942c1b4dd5ed2581e29de147030a1a to your computer and use it in GitHub Desktop.
Cache-friendly matrix transpose
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define CATCH_CONFIG_ENABLE_BENCHMARKING | |
#define CATCH_CONFIG_RUNNER | |
#include "catch.hpp" | |
namespace { | |
// Absolute-tolerance comparison of two doubles.
// Returns true only when |x - y| is strictly smaller than tol, so a tolerance
// of zero never matches and a NaN operand always yields false.
bool equal(double x, double y, double tol)
{
    const double gap = std::fabs(x - y);
    return gap < tol;
}
class MatrixXd { | |
public: | |
MatrixXd() : | |
m_data(), m_rows(0), m_cols(0) {} | |
MatrixXd(int row, int col) : | |
m_data(row*col), m_rows(row), m_cols(col) {} | |
double& operator() (int row, int col) | |
{ | |
return m_data[row*m_cols + col]; | |
} | |
double operator() (int row, int col) const | |
{ | |
return m_data[row*m_cols + col]; | |
} | |
MatrixXd transposeSrc() const | |
{ | |
MatrixXd At(m_cols, m_rows); | |
for (int i = 0; i < m_rows; i++) { | |
for (int j = 0; j < m_cols; j++) { | |
At(j, i) = (*this)(i, j); | |
} | |
} | |
return At; | |
} | |
MatrixXd transposeDst() const | |
{ | |
MatrixXd At(m_cols, m_rows); | |
for (int i = 0; i < m_cols; i++) { | |
for (int j = 0; j < m_rows; j++) { | |
At(i, j) = (*this)(j, i); | |
} | |
} | |
return At; | |
} | |
MatrixXd transposeTiling(int tileSize = 16) const | |
{ | |
MatrixXd At(m_cols, m_rows); | |
for (int i = 0; i < m_rows;) { | |
for (; i <= m_rows - tileSize; i += tileSize) { | |
int j = 0; | |
for (; j <= m_cols - tileSize; j += tileSize) { | |
for (int k = i; k < i + tileSize; k++) { | |
for (int l = j; l < j + tileSize; l++) { | |
At(l, k) = (*this)(k, l); | |
} | |
} | |
} | |
for (int k = i; k < i + tileSize; k++) { | |
for (int l = j; l < m_cols; l++) { | |
At(l, k) = (*this)(k, l); | |
} | |
} | |
} | |
for (; i < m_rows; i++) { | |
for (int j = 0; j < m_cols; j++) { | |
At(j, i) = (*this)(i, j); | |
} | |
} | |
} | |
return At; | |
} | |
MatrixXd transposeTilingSO(int tileSize = 16) const | |
{ | |
MatrixXd out(m_cols, m_rows); | |
for (int i = 0; i < m_rows; i += tileSize) { | |
for (int j = 0; j < m_cols; ++j) { | |
for (int b = 0; b < tileSize && i + b < m_rows; ++b) { | |
out.m_data[j*m_rows + i + b] = (*this).m_data[(i + b)*m_cols + j]; | |
} | |
} | |
} | |
return out; | |
} | |
MatrixXd transposeOptim(int tileSize = 16) const | |
{ | |
if (m_rows > 2 * m_cols && m_cols <= 64) { | |
return transposeSrc(); | |
} else if (m_cols > 2 * m_rows && m_rows <= 64) { | |
return transposeDst(); | |
} else if (m_rows % tileSize == 0) { | |
return transposeTilingSO(); | |
} else { | |
return transposeTiling(); | |
} | |
} | |
bool operator==(const MatrixXd& b) const | |
{ | |
if (b.m_rows != m_rows || b.m_cols != m_cols) { | |
return false; | |
} | |
for (int i = 0; i < m_rows; i++) { | |
for (int j = 0; j < m_cols; j++) { | |
if (!equal((*this)(i, j), b(i, j), std::numeric_limits<double>::epsilon())) { | |
return false; | |
} | |
} | |
} | |
return true; | |
} | |
std::vector<double> m_data; | |
int m_rows; | |
int m_cols; | |
}; | |
// Builds an sz1 x sz2 matrix whose elements are numbered 0, 1, 2, ... in
// row-major order, i.e. element (row, col) holds row * sz2 + col.
MatrixXd generateMatrix(int sz1, int sz2) {
    MatrixXd result(sz1, sz2);
    int value = 0; // next number to store; advances in storage order
    for (int row = 0; row < result.m_rows; ++row) {
        for (int col = 0; col < result.m_cols; ++col) {
            result(row, col) = value;
            ++value;
        }
    }
    return result;
}
// Builds the sz2 x sz1 matrix in which element (i, j) holds j * sz2 + i.
// This is the expected transpose of generateMatrix(sz1, sz2), produced
// directly rather than through any of the transpose routines.
MatrixXd generateMatrixTranspose(int sz1, int sz2) {
    MatrixXd expected(sz2, sz1);
    int value = 0; // advances down each column, then on to the next column
    for (int col = 0; col < expected.m_cols; ++col) {
        for (int row = 0; row < expected.m_rows; ++row) {
            expected(row, col) = value;
            ++value;
        }
    }
    return expected;
}
// Benchmarks every transpose strategy on a range of matrix shapes: large
// rectangular and square matrices, tall/narrow and short/wide ones, and sizes
// whose row count is or is not a multiple of the default tile size.
//
// Each strategy is checked against the expected transpose once, outside the
// timed region, so the reported timings cover the transpose alone and not the
// element-wise comparison. The benchmark bodies return the result so the work
// cannot be optimised away.
//
// Benchmarks are grouped by matrix size. To group the output by strategy
// instead, use one loop over `sizes` per strategy.
TEST_CASE("Benchmark matrix transpose", "[benchmark]") {
    const std::vector<std::pair<int, int>> sizes = { {701, 1503}, {1791, 837}, {1201, 1201}, {1024, 1024}, {2000, 2000},
        {10, 6}, {100, 6}, {500, 6}, {1000, 6}, {2000, 6},
        {10, 64}, {100, 64}, {500, 64}, {1000, 64}, {2000, 64},
        {6, 10}, {6, 100}, {6, 500}, {6, 1000}, {6, 2000},
        {640, 1000}, {800, 640}, {640, 500}, {500, 640}, {640, 837}, {837, 640}
    };

    for (const auto& sz : sizes) {
        const MatrixXd M = generateMatrix(sz.first, sz.second);
        const MatrixXd Mt_true = generateMatrixTranspose(sz.first, sz.second);

        // Common benchmark-name prefix, e.g. "701x1503 - M.".
        std::ostringstream oss;
        oss << sz.first << "x" << sz.second << " - M.";
        const auto prefix = oss.str();

        REQUIRE(M.transposeSrc() == Mt_true);
        BENCHMARK((prefix + "transposeSrc()").c_str()) {
            return M.transposeSrc();
        };

        REQUIRE(M.transposeDst() == Mt_true);
        BENCHMARK((prefix + "transposeDst()").c_str()) {
            return M.transposeDst();
        };

        REQUIRE(M.transposeTilingSO() == Mt_true);
        BENCHMARK((prefix + "transposeTilingSO()").c_str()) {
            return M.transposeTilingSO();
        };

        REQUIRE(M.transposeTiling() == Mt_true);
        BENCHMARK((prefix + "transposeTiling()").c_str()) {
            return M.transposeTiling();
        };

        REQUIRE(M.transposeOptim() == Mt_true);
        BENCHMARK((prefix + "transposeOptim()").c_str()) {
            return M.transposeOptim();
        };
    }
}
} | |
int main(int argc, char *argv[]) | |
{ | |
Catch::Session session; // There must be exactly one instance | |
// Let Catch (using Clara) parse the command line | |
session.applyCommandLine(argc, argv); | |
int numFailed = session.run(); | |
// numFailed is clamped to 255 as some unices only use the lower 8 bits. | |
// This clamping has already been applied, so just return it here | |
// You can also do any post run clean-up here | |
return numFailed; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
benchmark name samples iterations estimated | |
mean low mean high mean | |
std dev low std dev high std dev | |
------------------------------------------------------------------------------- | |
701x1503 - M.transposeSrc() 100 1 1.33581 s | |
13.5217 ms 13.4686 ms 13.6241 ms | |
363.282 us 205.441 us 614.763 us | |
701x1503 - M.transposeDst() 100 1 1.46756 s | |
14.9272 ms 14.8749 ms 14.986 ms | |
282.704 us 246.542 us 330.304 us | |
701x1503 - M.transposeTilingSO() 100 1 1.20588 s | |
12.1736 ms 12.0986 ms 12.4693 ms | |
674.628 us 130.473 us 1.58296 ms | |
701x1503 - M.transposeTiling() 100 1 879.485 ms | |
8.80787 ms 8.7821 ms 8.8395 ms | |
145.482 us 120.471 us 177.312 us | |
701x1503 - M.transposeOptim() 100 1 875.261 ms | |
8.75884 ms 8.73439 ms 8.79353 ms | |
147.051 us 113.127 us 216.557 us | |
1791x837 - M.transposeSrc() 100 1 2.00429 s | |
20.1265 ms 20.0448 ms 20.3114 ms | |
593.232 us 224.222 us 1.04084 ms | |
1791x837 - M.transposeDst() 100 1 2.18822 s | |
21.9146 ms 21.8052 ms 22.0655 ms | |
648.663 us 507.725 us 852.9 us | |
1791x837 - M.transposeTilingSO() 100 1 1.7262 s | |
17.3702 ms 17.3125 ms 17.4566 ms | |
354.908 us 261.148 us 528.674 us | |
1791x837 - M.transposeTiling() 100 1 1.29939 s | |
12.8264 ms 12.7865 ms 12.8779 ms | |
231.128 us 185.655 us 303.622 us | |
1791x837 - M.transposeOptim() 100 1 1.27144 s | |
12.7916 ms 12.7492 ms 12.8483 ms | |
249.525 us 195.995 us 342.697 us | |
1201x1201 - M.transposeSrc() 100 1 1.8964 s | |
18.9402 ms 18.8951 ms 19.0037 ms | |
269.819 us 206.147 us 384.575 us | |
1201x1201 - M.transposeDst() 100 1 2.08132 s | |
20.8071 ms 20.7606 ms 20.8678 ms | |
269.753 us 217.186 us 350.209 us | |
1201x1201 - M.transposeTilingSO() 100 1 1.69206 s | |
16.4527 ms 16.3729 ms 16.7379 ms | |
686.675 us 194.436 us 1.57422 ms | |
1201x1201 - M.transposeTiling() 100 1 1.24112 s | |
12.382 ms 12.3463 ms 12.4389 ms | |
225.193 us 161.788 us 367.464 us | |
1201x1201 - M.transposeOptim() 100 1 1.25534 s | |
12.5325 ms 12.491 ms 12.5977 ms | |
262.328 us 189.402 us 448.495 us | |
1024x1024 - M.transposeSrc() 100 1 1.4795 s | |
14.5538 ms 14.4798 ms 14.6498 ms | |
426.128 us 343.01 us 538.435 us | |
1024x1024 - M.transposeDst() 100 1 1.78832 s | |
17.6619 ms 17.5781 ms 17.8555 ms | |
612.149 us 332.978 us 1.2434 ms | |
1024x1024 - M.transposeTilingSO() 100 1 827.986 ms | |
8.3198 ms 8.25111 ms 8.56124 ms | |
587.589 us 180.224 us 1.33455 ms | |
1024x1024 - M.transposeTiling() 100 1 1.0355 s | |
10.3063 ms 10.2595 ms 10.4512 ms | |
379.301 us 153.84 us 831.083 us | |
1024x1024 - M.transposeOptim() 100 1 831.858 ms | |
8.45701 ms 8.32702 ms 8.6856 ms | |
854.014 us 546.172 us 1.22753 ms | |
2000x2000 - M.transposeSrc() 100 1 5.76389 s | |
57.6496 ms 57.5226 ms 57.7993 ms | |
699.662 us 601.632 us 841.167 us | |
2000x2000 - M.transposeDst() 100 1 6.0115 s | |
60.5116 ms 60.264 ms 60.958 ms | |
1.65362 ms 1.09742 ms 2.96921 ms | |
2000x2000 - M.transposeTilingSO() 100 1 3.54605 s | |
35.3772 ms 35.3106 ms 35.4719 ms | |
402.03 us 308.713 us 620.57 us | |
2000x2000 - M.transposeTiling() 100 1 3.63679 s | |
36.0236 ms 35.7791 ms 36.4135 ms | |
1.5485 ms 1.09031 ms 2.21434 ms | |
2000x2000 - M.transposeOptim() 100 1 3.63542 s | |
36.1886 ms 35.9014 ms 36.6428 ms | |
1.81409 ms 1.29751 ms 2.72564 ms | |
10x6 - M.transposeSrc() 100 147 5.0274 ms | |
343 ns 336 ns 353 ns | |
42 ns 29 ns 64 ns | |
10x6 - M.transposeDst() 100 144 5.04 ms | |
347 ns 343 ns 355 ns | |
27 ns 18 ns 46 ns | |
10x6 - M.transposeTilingSO() 100 118 5.0268 ms | |
450 ns 443 ns 460 ns | |
44 ns 33 ns 57 ns | |
10x6 - M.transposeTiling() 100 145 5.0315 ms | |
345 ns 342 ns 350 ns | |
17 ns 10 ns 34 ns | |
10x6 - M.transposeOptim() 100 142 5.0268 ms | |
345 ns 341 ns 353 ns | |
30 ns 19 ns 48 ns | |
100x6 - M.transposeSrc() 100 26 5.1454 ms | |
1.896 us 1.867 us 1.945 us | |
188 ns 134 ns 277 ns | |
100x6 - M.transposeDst() 100 24 5.0472 ms | |
2.098 us 2.073 us 2.144 us | |
168 ns 110 ns 270 ns | |
100x6 - M.transposeTilingSO() 100 26 5.2208 ms | |
1.987 us 1.972 us 2.015 us | |
101 ns 64 ns 168 ns | |
100x6 - M.transposeTiling() 100 26 5.1038 ms | |
1.966 us 1.945 us 2.018 us | |
160 ns 53 ns 281 ns | |
100x6 - M.transposeOptim() 100 26 5.109 ms | |
1.923 us 1.901 us 1.957 us | |
136 ns 96 ns 201 ns | |
500x6 - M.transposeSrc() 100 6 5.6262 ms | |
9.285 us 9.112 us 9.558 us | |
1.089 us 768 ns 1.543 us | |
500x6 - M.transposeDst() 100 6 5.9478 ms | |
9.959 us 9.784 us 10.226 us | |
1.095 us 782 ns 1.468 us | |
500x6 - M.transposeTilingSO() 100 6 5.7468 ms | |
9.431 us 9.28 us 9.773 us | |
1.098 us 608 ns 2.182 us | |
500x6 - M.transposeTiling() 100 6 5.589 ms | |
9.401 us 9.228 us 9.713 us | |
1.147 us 762 ns 1.813 us | |
500x6 - M.transposeOptim() 100 6 5.5716 ms | |
9.204 us 9.061 us 9.437 us | |
914 ns 645 ns 1.384 us | |
1000x6 - M.transposeSrc() 100 3 5.5023 ms | |
17.883 us 17.613 us 18.513 us | |
1.979 us 1.061 us 3.911 us | |
1000x6 - M.transposeDst() 100 3 5.9508 ms | |
19.244 us 18.878 us 20.101 us | |
2.696 us 1.43 us 5.379 us | |
1000x6 - M.transposeTilingSO() 100 3 5.5887 ms | |
19.032 us 18.709 us 19.69 us | |
2.26 us 1.233 us 3.69 us | |
1000x6 - M.transposeTiling() 100 3 5.4729 ms | |
20.114 us 19.272 us 21.402 us | |
5.223 us 3.888 us 7.422 us | |
1000x6 - M.transposeOptim() 100 3 5.4549 ms | |
18.294 us 17.996 us 18.954 us | |
2.146 us 1.134 us 3.901 us | |
2000x6 - M.transposeSrc() 100 2 7.1956 ms | |
37.673 us 36.58 us 39.394 us | |
6.917 us 4.822 us 9.411 us | |
2000x6 - M.transposeDst() 100 2 7.9916 ms | |
39.985 us 39.756 us 40.489 us | |
1.654 us 761 ns 2.806 us | |
2000x6 - M.transposeTilingSO() 100 2 7.3978 ms | |
37.474 us 36.778 us 38.652 us | |
4.513 us 3.168 us 7.121 us | |
2000x6 - M.transposeTiling() 100 2 7.178 ms | |
36.411 us 35.865 us 37.924 us | |
4.276 us 1.951 us 9.129 us | |
2000x6 - M.transposeOptim() 100 2 7.277 ms | |
36.246 us 35.745 us 37.156 us | |
3.344 us 2.111 us 5.064 us | |
10x64 - M.transposeSrc() 100 24 5.0568 ms | |
2.138 us 2.12 us 2.173 us | |
126 ns 78 ns 196 ns | |
10x64 - M.transposeDst() 100 25 5.2225 ms | |
2.174 us 2.129 us 2.247 us | |
287 ns 207 ns 435 ns | |
10x64 - M.transposeTilingSO() 100 24 5.1744 ms | |
2.096 us 2.063 us 2.147 us | |
204 ns 147 ns 295 ns | |
10x64 - M.transposeTiling() 100 24 5.04 ms | |
2.08 us 2.057 us 2.118 us | |
146 ns 97 ns 217 ns | |
10x64 - M.transposeOptim() 100 24 5.2056 ms | |
2.117 us 2.113 us 2.127 us | |
30 ns 14 ns 63 ns | |
100x64 - M.transposeSrc() 100 3 6.2262 ms | |
20.21 us 20.006 us 20.918 us | |
1.709 us 521 ns 3.853 us | |
100x64 - M.transposeDst() 100 3 6.7932 ms | |
22.555 us 22.212 us 23.831 us | |
3.008 us 715 ns 6.97 us | |
100x64 - M.transposeTilingSO() 100 3 6.1863 ms | |
20.533 us 20.274 us 21.039 us | |
1.772 us 1.03 us 2.847 us | |
100x64 - M.transposeTiling() 100 3 5.7456 ms | |
19.351 us 19.029 us 20.018 us | |
2.272 us 1.156 us 3.812 us | |
100x64 - M.transposeOptim() 100 3 5.8194 ms | |
19.032 us 18.765 us 19.671 us | |
1.983 us 1.034 us 4.028 us | |
500x64 - M.transposeSrc() 100 1 12.0583 ms | |
122.591 us 119.687 us 128.673 us | |
20.488 us 10.986 us 33.249 us | |
500x64 - M.transposeDst() 100 1 11.7043 ms | |
120.092 us 116.98 us 125.237 us | |
20.019 us 13.892 us 29.15 us | |
500x64 - M.transposeTilingSO() 100 1 10.5694 ms | |
103.439 us 102.182 us 105.384 us | |
7.837 us 5.641 us 10.791 us | |
500x64 - M.transposeTiling() 100 1 10.6368 ms | |
106.94 us 106.374 us 108.516 us | |
4.39 us 1.848 us 9.122 us | |
500x64 - M.transposeOptim() 100 1 11.805 ms | |
117.716 us 115.789 us 121.624 us | |
13.424 us 8.167 us 24.381 us | |
1000x64 - M.transposeSrc() 100 1 23.7876 ms | |
209.335 us 206.118 us 216.338 us | |
23.03 us 12.992 us 42.105 us | |
1000x64 - M.transposeDst() 100 1 25.8148 ms | |
262.432 us 259.313 us 270.463 us | |
23.63 us 11.117 us 47.815 us | |
1000x64 - M.transposeTilingSO() 100 1 20.7829 ms | |
211.797 us 209.307 us 216.176 us | |
16.435 us 10.925 us 25.933 us | |
1000x64 - M.transposeTiling() 100 1 19.5189 ms | |
193.527 us 190.805 us 200.469 us | |
20.697 us 9.885 us 42.525 us | |
1000x64 - M.transposeOptim() 100 1 20.3983 ms | |
203.244 us 201.917 us 206.257 us | |
9.698 us 5.094 us 19.272 us | |
2000x64 - M.transposeSrc() 100 1 41.493 ms | |
417.131 us 409.83 us 430.768 us | |
49.594 us 29.832 us 78.773 us | |
2000x64 - M.transposeDst() 100 1 53.0408 ms | |
609.291 us 589.343 us 638.272 us | |
121.407 us 90.333 us 164.889 us | |
2000x64 - M.transposeTilingSO() 100 1 47.7331 ms | |
438.563 us 434.518 us 444.161 us | |
24.072 us 18.853 us 32.667 us | |
2000x64 - M.transposeTiling() 100 1 39.2658 ms | |
645.482 us 608.959 us 690.002 us | |
204.912 us 170.205 us 254.469 us | |
2000x64 - M.transposeOptim() 100 1 43.9548 ms | |
413.109 us 410.035 us 417.682 us | |
18.913 us 13.653 us 25.959 us | |
6x10 - M.transposeSrc() 100 143 5.0479 ms | |
336 ns 333 ns 343 ns | |
22 ns 14 ns 34 ns | |
6x10 - M.transposeDst() 100 141 5.0478 ms | |
353 ns 350 ns 359 ns | |
19 ns 9 ns 33 ns | |
6x10 - M.transposeTilingSO() 100 144 5.04 ms | |
333 ns 330 ns 341 ns | |
25 ns 13 ns 51 ns | |
6x10 - M.transposeTiling() 100 145 5.0315 ms | |
374 ns 361 ns 393 ns | |
80 ns 63 ns 101 ns | |
6x10 - M.transposeOptim() 100 143 5.0336 ms | |
347 ns 344 ns 352 ns | |
21 ns 16 ns 31 ns | |
6x100 - M.transposeSrc() 100 26 5.174 ms | |
1.987 us 1.969 us 2.03 us | |
134 ns 66 ns 249 ns | |
6x100 - M.transposeDst() 100 25 5.095 ms | |
2.012 us 1.99 us 2.057 us | |
155 ns 94 ns 274 ns | |
6x100 - M.transposeTilingSO() 100 25 5.18 ms | |
2.01 us 1.978 us 2.071 us | |
218 ns 132 ns 354 ns | |
6x100 - M.transposeTiling() 100 26 5.1298 ms | |
1.995 us 1.974 us 2.035 us | |
141 ns 90 ns 251 ns | |
6x100 - M.transposeOptim() 100 25 5.1175 ms | |
2.032 us 2.004 us 2.083 us | |
188 ns 125 ns 304 ns | |
6x500 - M.transposeSrc() 100 6 5.6406 ms | |
9.49 us 9.374 us 9.709 us | |
785 ns 486 ns 1.264 us | |
6x500 - M.transposeDst() 100 5 5.324 ms | |
10.026 us 9.841 us 10.438 us | |
1.337 us 729 ns 2.446 us | |
6x500 - M.transposeTilingSO() 100 5 5.047 ms | |
9.932 us 9.839 us 10.186 us | |
717 ns 313 ns 1.472 us | |
6x500 - M.transposeTiling() 100 6 5.6376 ms | |
9.354 us 9.257 us 9.579 us | |
712 ns 344 ns 1.242 us | |
6x500 - M.transposeOptim() 100 6 5.9208 ms | |
9.864 us 9.773 us 10.077 us | |
669 ns 347 ns 1.238 us | |
6x1000 - M.transposeSrc() 100 3 6.9297 ms | |
33.555 us 33.057 us 34.385 us | |
3.202 us 2.206 us 5.248 us | |
6x1000 - M.transposeDst() 100 3 5.9598 ms | |
19.72 us 19.376 us 20.37 us | |
2.337 us 1.51 us 4.095 us | |
6x1000 - M.transposeTilingSO() 100 3 5.9967 ms | |
19.719 us 19.5 us 20.454 us | |
1.808 us 506 ns 3.985 us | |
6x1000 - M.transposeTiling() 100 3 6.1092 ms | |
20.358 us 20.093 us 20.826 us | |
1.757 us 1.125 us 2.672 us | |
6x1000 - M.transposeOptim() 100 3 5.8419 ms | |
19.59 us 19.282 us 20.231 us | |
2.182 us 1.054 us 3.667 us | |
6x2000 - M.transposeSrc() 100 2 8.1376 ms | |
41.004 us 40.358 us 42.238 us | |
4.417 us 2.737 us 7.489 us | |
6x2000 - M.transposeDst() 100 2 7.8642 ms | |
39.302 us 38.882 us 40.608 us | |
3.367 us 1.193 us 7.286 us | |
6x2000 - M.transposeTilingSO() 100 2 7.8902 ms | |
39.145 us 38.839 us 39.636 us | |
1.938 us 1.374 us 3.195 us | |
6x2000 - M.transposeTiling() 100 2 8.2668 ms | |
40.913 us 40.391 us 42.242 us | |
3.934 us 1.634 us 7.834 us | |
6x2000 - M.transposeOptim() 100 2 7.9046 ms | |
38.17 us 37.319 us 40.069 us | |
6.156 us 2.587 us 10.519 us | |
640x1000 - M.transposeSrc() 100 1 812.533 ms | |
8.17895 ms 8.14204 ms 8.23748 ms | |
232.495 us 159.016 us 357.593 us | |
640x1000 - M.transposeDst() 100 1 820.454 ms | |
8.21139 ms 8.18048 ms 8.24553 ms | |
166.413 us 146.152 us 192.761 us | |
640x1000 - M.transposeTilingSO() 100 1 551.423 ms | |
5.58898 ms 5.5692 ms 5.61138 ms | |
107.271 us 92.915 us 127.154 us | |
640x1000 - M.transposeTiling() 100 1 539.407 ms | |
5.56831 ms 5.4872 ms 5.71791 ms | |
543.574 us 340.069 us 929.592 us | |
640x1000 - M.transposeOptim() 100 1 565.028 ms | |
5.69277 ms 5.62122 ms 5.93922 ms | |
604.531 us 187.6 us 1.36717 ms | |
800x640 - M.transposeSrc() 100 1 541.873 ms | |
5.37478 ms 5.34312 ms 5.41696 ms | |
186.691 us 147.59 us 252.971 us | |
800x640 - M.transposeDst() 100 1 700.771 ms | |
6.99152 ms 6.96815 ms 7.01857 ms | |
128.46 us 106.449 us 161.095 us | |
800x640 - M.transposeTilingSO() 100 1 424.757 ms | |
4.42832 ms 4.36525 ms 4.61912 ms | |
507.931 us 193.717 us 1.10549 ms | |
800x640 - M.transposeTiling() 100 1 398.23 ms | |
4.04877 ms 4.01931 ms 4.09474 ms | |
184.445 us 133.435 us 303.767 us | |
800x640 - M.transposeOptim() 100 1 434.316 ms | |
4.36581 ms 4.31834 ms 4.46928 ms | |
340.103 us 154.871 us 586.834 us | |
640x500 - M.transposeSrc() 100 1 289.336 ms | |
3.32501 ms 3.22103 ms 3.45449 ms | |
588.368 us 505.994 us 739.506 us | |
640x500 - M.transposeDst() 100 1 646.412 ms | |
4.10666 ms 4.07737 ms 4.1477 ms | |
176.079 us 133.739 us 241.46 us | |
640x500 - M.transposeTilingSO() 100 1 313.07 ms | |
3.13385 ms 3.0495 ms 3.46781 ms | |
745.226 us 161.203 us 1.73092 ms | |
640x500 - M.transposeTiling() 100 1 279.971 ms | |
2.49484 ms 2.46777 ms 2.52341 ms | |
142.876 us 128.217 us 161.907 us | |
640x500 - M.transposeOptim() 100 1 292.134 ms | |
2.90779 ms 2.86965 ms 2.9782 ms | |
256.395 us 154.102 us 406.094 us | |
500x640 - M.transposeSrc() 100 1 371.786 ms | |
3.7282 ms 3.71351 ms 3.74981 ms | |
89.155 us 66.169 us 124.668 us | |
500x640 - M.transposeDst() 100 1 288.338 ms | |
2.92213 ms 2.90493 ms 2.94413 ms | |
98.936 us 81.054 us 128.952 us | |
500x640 - M.transposeTilingSO() 100 1 261.724 ms | |
2.60663 ms 2.59201 ms 2.62781 ms | |
88.934 us 67.512 us 144.408 us | |
500x640 - M.transposeTiling() 100 1 237.29 ms | |
2.38999 ms 2.37095 ms 2.41382 ms | |
108.116 us 90.8 us 133.621 us | |
500x640 - M.transposeOptim() 100 1 233.814 ms | |
2.39261 ms 2.37263 ms 2.43491 ms | |
142.436 us 82.648 us 279.288 us | |
640x837 - M.transposeSrc() 100 1 681.346 ms | |
6.7371 ms 6.71323 ms 6.77297 ms | |
146.595 us 108.376 us 220.458 us | |
640x837 - M.transposeDst() 100 1 720.69 ms | |
7.28952 ms 7.26036 ms 7.34868 ms | |
203.885 us 122.192 us 395.995 us | |
640x837 - M.transposeTilingSO() 100 1 595.576 ms | |
6.02544 ms 6.00303 ms 6.05725 ms | |
134.962 us 102.382 us 194.305 us | |
640x837 - M.transposeTiling() 100 1 439.721 ms | |
4.45634 ms 4.38057 ms 4.69761 ms | |
617.912 us 176.905 us 1.36078 ms | |
640x837 - M.transposeOptim() 100 1 594.039 ms | |
6.05717 ms 5.95265 ms 6.53803 ms | |
987.605 us 92.679 us 2.34693 ms | |
837x640 - M.transposeSrc() 100 1 615.768 ms | |
6.18926 ms 6.15459 ms 6.27144 ms | |
256.563 us 134.705 us 524.513 us | |
837x640 - M.transposeDst() 100 1 735.981 ms | |
7.38238 ms 7.35619 ms 7.41189 ms | |
141.714 us 120.568 us 166.691 us | |
837x640 - M.transposeTilingSO() 100 1 454.766 ms | |
4.50993 ms 4.48762 ms 4.55183 ms | |
151.788 us 97.711 us 281.057 us | |
837x640 - M.transposeTiling() 100 1 419.902 ms | |
4.21865 ms 4.18954 ms 4.26208 ms | |
179.769 us 129.087 us 267.846 us | |
837x640 - M.transposeOptim() 100 1 426.574 ms | |
4.26096 ms 4.17571 ms 4.59033 ms | |
788.384 us 102.962 us 1.86637 ms | |
=============================================================================== | |
All tests passed (12802962 assertions in 1 test case) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
benchmark name samples iterations estimated | |
mean low mean high mean | |
std dev low std dev high std dev | |
------------------------------------------------------------------------------- | |
701x1503 - M.transposeSrc() 100 1 357.183 ms | |
3.46899 ms 3.43505 ms 3.50642 ms | |
181.466 us 163.964 us 205.134 us | |
701x1503 - M.transposeDst() 100 1 353.707 ms | |
3.47051 ms 3.42374 ms 3.52123 ms | |
247.313 us 219.43 us 283.555 us | |
701x1503 - M.transposeTilingSO() 100 1 320.089 ms | |
3.22827 ms 3.17116 ms 3.29639 ms | |
317.913 us 278.554 us 369.349 us | |
701x1503 - M.transposeTiling() 100 1 259.815 ms | |
2.77319 ms 2.72398 ms 2.82446 ms | |
255.077 us 235.877 us 285.615 us | |
701x1503 - M.transposeOptim() 100 1 260.578 ms | |
2.71571 ms 2.66495 ms 2.77453 ms | |
278.662 us 243.735 us 332.941 us | |
1791x837 - M.transposeSrc() 100 1 568.204 ms | |
5.53622 ms 5.49697 ms 5.58348 ms | |
220.496 us 186.936 us 295.12 us | |
1791x837 - M.transposeDst() 100 1 811.527 ms | |
8.04237 ms 8.01087 ms 8.07725 ms | |
169.187 us 144.091 us 207.33 us | |
1791x837 - M.transposeTilingSO() 100 1 468.185 ms | |
4.52806 ms 4.48044 ms 4.57947 ms | |
251.963 us 228.824 us 283.423 us | |
1791x837 - M.transposeTiling() 100 1 412.859 ms | |
4.10997 ms 4.04017 ms 4.18759 ms | |
376.66 us 338.979 us 422.86 us | |
1791x837 - M.transposeOptim() 100 1 392.493 ms | |
4.13067 ms 4.07017 ms 4.19579 ms | |
321.46 us 292.933 us 355.637 us | |
1201x1201 - M.transposeSrc() 100 1 502.524 ms | |
4.92291 ms 4.88083 ms 4.97149 ms | |
229.526 us 198.987 us 272.446 us | |
1201x1201 - M.transposeDst() 100 1 554.466 ms | |
5.53983 ms 5.49058 ms 5.5966 ms | |
269.172 us 229.191 us 326.663 us | |
1201x1201 - M.transposeTilingSO() 100 1 433.002 ms | |
4.44824 ms 4.3968 ms 4.50724 ms | |
280.264 us 244.828 us 330.229 us | |
1201x1201 - M.transposeTiling() 100 1 399.054 ms | |
3.75262 ms 3.70161 ms 3.80912 ms | |
272.986 us 247.791 us 303.901 us | |
1201x1201 - M.transposeOptim() 100 1 397.614 ms | |
3.84316 ms 3.77968 ms 3.91483 ms | |
344.915 us 302.133 us 410.838 us | |
1024x1024 - M.transposeSrc() 100 1 541.995 ms | |
5.61084 ms 5.54408 ms 5.68312 ms | |
354.174 us 321.713 us 392.375 us | |
1024x1024 - M.transposeDst() 100 1 449.142 ms | |
4.29921 ms 4.2461 ms 4.36585 ms | |
303.533 us 250.368 us 382.601 us | |
1024x1024 - M.transposeTilingSO() 100 1 282.063 ms | |
2.74389 ms 2.67914 ms 2.82097 ms | |
359.898 us 312.862 us 418.617 us | |
1024x1024 - M.transposeTiling() 100 1 471.38 ms | |
4.78148 ms 4.73581 ms 4.8339 ms | |
248.556 us 215.324 us 301.822 us | |
1024x1024 - M.transposeOptim() 100 1 269.492 ms | |
2.72223 ms 2.67947 ms 2.77115 ms | |
232.913 us 203.185 us 266.345 us | |
2000x2000 - M.transposeSrc() 100 1 2.26015 s | |
22.3536 ms 22.2669 ms 22.4523 ms | |
473.398 us 414.672 us 554.137 us | |
2000x2000 - M.transposeDst() 100 1 2.38719 s | |
23.7494 ms 23.6728 ms 23.8316 ms | |
403.62 us 360.271 us 459.177 us | |
2000x2000 - M.transposeTilingSO() 100 1 1.17682 s | |
12.136 ms 12.068 ms 12.2013 ms | |
340.222 us 308.22 us 378.588 us | |
2000x2000 - M.transposeTiling() 100 1 1.18385 s | |
12.0904 ms 12.0238 ms 12.1592 ms | |
346.494 us 308.903 us 397.502 us | |
2000x2000 - M.transposeOptim() 100 1 1.20988 s | |
12.0665 ms 11.9959 ms 12.1398 ms | |
368.575 us 331.096 us 416.114 us | |
10x6 - M.transposeSrc() 100 128 2.048 ms | |
171 ns 171 ns 172 ns | |
2 ns 0 ns 4 ns | |
10x6 - M.transposeDst() 100 141 2.0445 ms | |
147 ns 147 ns 148 ns | |
1 ns 0 ns 3 ns | |
10x6 - M.transposeTilingSO() 100 129 2.0382 ms | |
160 ns 160 ns 161 ns | |
1 ns 0 ns 2 ns | |
10x6 - M.transposeTiling() 100 133 2.0482 ms | |
156 ns 156 ns 156 ns | |
0 ns 0 ns 1 ns | |
10x6 - M.transposeOptim() 100 133 2.0482 ms | |
150 ns 149 ns 150 ns | |
1 ns 0 ns 2 ns | |
100x6 - M.transposeSrc() 100 19 2.0501 ms | |
1.064 us 1.061 us 1.068 us | |
17 ns 12 ns 30 ns | |
100x6 - M.transposeDst() 100 25 2.0625 ms | |
799 ns 798 ns 803 ns | |
8 ns 0 ns 20 ns | |
100x6 - M.transposeTilingSO() 100 22 2.0636 ms | |
938 ns 936 ns 943 ns | |
13 ns 1 ns 28 ns | |
100x6 - M.transposeTiling() 100 22 2.1032 ms | |
959 ns 957 ns 965 ns | |
11 ns 1 ns 27 ns | |
100x6 - M.transposeOptim() 100 20 2.122 ms | |
1.073 us 1.072 us 1.078 us | |
10 ns 1 ns 25 ns | |
500x6 - M.transposeSrc() 100 5 2.497 ms | |
5.126 us 5.103 us 5.162 us | |
143 ns 100 ns 244 ns | |
500x6 - M.transposeDst() 100 6 2.2788 ms | |
3.861 us 3.76 us 4.183 us | |
817 ns 271 ns 1.791 us | |
500x6 - M.transposeTilingSO() 100 5 2.2025 ms | |
4.245 us 4.238 us 4.283 us | |
75 ns 4 ns 180 ns | |
500x6 - M.transposeTiling() 100 5 2.286 ms | |
4.393 us 4.385 us 4.433 us | |
80 ns 4 ns 191 ns | |
500x6 - M.transposeOptim() 100 4 2.0456 ms | |
4.93 us 4.917 us 4.967 us | |
101 ns 41 ns 221 ns | |
1000x6 - M.transposeSrc() 100 3 3.0453 ms | |
9.868 us 9.837 us 9.917 us | |
193 ns 127 ns 334 ns | |
1000x6 - M.transposeDst() 100 3 2.5881 ms | |
8.516 us 8.492 us 8.611 us | |
207 ns 14 ns 478 ns | |
1000x6 - M.transposeTilingSO() 100 3 2.5995 ms | |
8.731 us 8.705 us 8.822 us | |
204 ns 10 ns 463 ns | |
1000x6 - M.transposeTiling() 100 3 2.658 ms | |
9.543 us 9.152 us 10.529 us | |
2.785 us 290 ns 5.08 us | |
1000x6 - M.transposeOptim() 100 3 2.9787 ms | |
9.874 us 9.765 us 10.363 us | |
995 ns 132 ns 2.345 us | |
2000x6 - M.transposeSrc() 100 2 4.025 ms | |
20.42 us 20.027 us 21.552 us | |
3.104 us 1.248 us 6.61 us | |
2000x6 - M.transposeDst() 100 2 3.5252 ms | |
17.426 us 17.095 us 18.248 us | |
2.44 us 664 ns 4.363 us | |
2000x6 - M.transposeTilingSO() 100 2 3.5348 ms | |
17.287 us 17.213 us 17.438 us | |
517 ns 318 ns 990 ns | |
2000x6 - M.transposeTiling() 100 2 3.6406 ms | |
17.949 us 17.756 us 18.696 us | |
1.676 us 393 ns 3.856 us | |
2000x6 - M.transposeOptim() 100 2 4.04 ms | |
20.078 us 20.027 us 20.191 us | |
372 ns 201 ns 665 ns | |
10x64 - M.transposeSrc() 100 18 2.097 ms | |
1.164 us 1.161 us 1.173 us | |
24 ns 3 ns 46 ns | |
10x64 - M.transposeDst() 100 23 2.0907 ms | |
908 ns 906 ns 913 ns | |
13 ns 1 ns 26 ns | |
10x64 - M.transposeTilingSO() 100 20 2.08 ms | |
1.053 us 1.052 us 1.058 us | |
13 ns 2 ns 25 ns | |
10x64 - M.transposeTiling() 100 21 2.1105 ms | |
1.079 us 1.033 us 1.161 us | |
305 ns 196 ns 459 ns | |
10x64 - M.transposeOptim() 100 23 2.1229 ms | |
897 ns 895 ns 901 ns | |
12 ns 5 ns 27 ns | |
100x64 - M.transposeSrc() 100 2 2.3078 ms | |
11.234 us 11.206 us 11.314 us | |
214 ns 20 ns 451 ns | |
100x64 - M.transposeDst() 100 2 2.1264 ms | |
10.271 us 10.229 us 10.383 us | |
312 ns 31 ns 634 ns | |
100x64 - M.transposeTilingSO() 100 3 2.9781 ms | |
9.643 us 9.614 us 9.756 us | |
247 ns 15 ns 574 ns | |
100x64 - M.transposeTiling() 100 3 2.8785 ms | |
9.79 us 9.741 us 9.878 us | |
328 ns 218 ns 558 ns | |
100x64 - M.transposeOptim() 100 3 2.946 ms | |
9.86 us 9.826 us 9.948 us | |
246 ns 23 ns 480 ns | |
500x64 - M.transposeSrc() 100 1 5.8866 ms | |
59.45 us 59.302 us 60.007 us | |
1.275 us 319 ns 2.903 us | |
500x64 - M.transposeDst() 100 1 6.9596 ms | |
69.204 us 68.983 us 69.972 us | |
1.866 us 548 ns 4.22 us | |
500x64 - M.transposeTilingSO() 100 1 4.9472 ms | |
50.91 us 49.509 us 53.688 us | |
9.687 us 5.463 us 15.69 us | |
500x64 - M.transposeTiling() 100 1 4.8529 ms | |
47.505 us 47.249 us 48.545 us | |
2.271 us 378 ns 5.241 us | |
500x64 - M.transposeOptim() 100 1 5.9125 ms | |
58.801 us 58.094 us 60.208 us | |
4.889 us 2.854 us 8.003 us | |
1000x64 - M.transposeSrc() 100 1 11.0419 ms | |
111.456 us 110.932 us 112.431 us | |
3.532 us 2.271 us 6.113 us | |
1000x64 - M.transposeDst() 100 1 17.0416 ms | |
170.834 us 170.262 us 172.324 us | |
4.369 us 2.139 us 9.125 us | |
1000x64 - M.transposeTilingSO() 100 1 9.823 ms | |
101.293 us 100.45 us 103.571 us | |
6.502 us 2.678 us 13.594 us | |
1000x64 - M.transposeTiling() 100 1 9.0109 ms | |
92.09 us 91.258 us 94.558 us | |
6.581 us 1.879 us 14.07 us | |
1000x64 - M.transposeOptim() 100 1 11.0237 ms | |
111.293 us 109.605 us 114.179 us | |
11.05 us 7.378 us 16.659 us | |
2000x64 - M.transposeSrc() 100 1 22.8322 ms | |
227.92 us 224.329 us 233.785 us | |
23.126 us 16.069 us 32.818 us | |
2000x64 - M.transposeDst() 100 1 36.995 ms | |
363.675 us 359.523 us 369.659 us | |
25.172 us 19.141 us 34.099 us | |
2000x64 - M.transposeTilingSO() 100 1 20.4045 ms | |
204.861 us 203.054 us 209.214 us | |
13.514 us 5.392 us 26.409 us | |
2000x64 - M.transposeTiling() 100 1 18.8847 ms | |
189.95 us 188.249 us 193.99 us | |
12.54 us 5.765 us 24.877 us | |
2000x64 - M.transposeOptim() 100 1 22.7242 ms | |
229.34 us 227.362 us 232.385 us | |
12.285 us 9.268 us 19.122 us | |
6x10 - M.transposeSrc() 100 128 2.048 ms | |
173 ns 173 ns 175 ns | |
3 ns 0 ns 7 ns | |
6x10 - M.transposeDst() 100 138 2.0424 ms | |
152 ns 152 ns 152 ns | |
1 ns 0 ns 2 ns | |
6x10 - M.transposeTilingSO() 100 126 2.0538 ms | |
166 ns 165 ns 167 ns | |
3 ns 1 ns 7 ns | |
6x10 - M.transposeTiling() 100 128 2.0352 ms | |
160 ns 159 ns 160 ns | |
1 ns 0 ns 3 ns | |
6x10 - M.transposeOptim() 100 131 2.0436 ms | |
158 ns 154 ns 165 ns | |
28 ns 19 ns 38 ns | |
6x100 - M.transposeSrc() 100 19 2.0615 ms | |
1.101 us 1.096 us 1.105 us | |
23 ns 14 ns 34 ns | |
6x100 - M.transposeDst() 100 23 2.0861 ms | |
875 ns 874 ns 879 ns | |
9 ns 1 ns 22 ns | |
6x100 - M.transposeTilingSO() 100 20 2.074 ms | |
1.049 us 1.047 us 1.053 us | |
16 ns 10 ns 25 ns | |
6x100 - M.transposeTiling() 100 21 2.1042 ms | |
1.035 us 1.033 us 1.038 us | |
9 ns 5 ns 18 ns | |
6x100 - M.transposeOptim() 100 23 2.1091 ms | |
927 ns 926 ns 932 ns | |
12 ns 5 ns 27 ns | |
6x500 - M.transposeSrc() 100 5 2.5145 ms | |
4.836 us 4.817 us 4.868 us | |
123 ns 85 ns 197 ns | |
6x500 - M.transposeDst() 100 5 2.152 ms | |
4.175 us 4.165 us 4.204 us | |
71 ns 9 ns 156 ns | |
6x500 - M.transposeTilingSO() 100 5 2.5 ms | |
4.864 us 4.857 us 4.891 us | |
64 ns 5 ns 152 ns | |
6x500 - M.transposeTiling() 100 5 2.24 ms | |
4.533 us 4.462 us 4.777 us | |
607 ns 189 ns 1.375 us | |
6x500 - M.transposeOptim() 100 5 2.183 ms | |
4.377 us 4.362 us 4.404 us | |
98 ns 63 ns 184 ns | |
6x1000 - M.transposeSrc() 100 2 2.3416 ms | |
11.711 us 11.664 us 11.776 us | |
278 ns 218 ns 415 ns | |
6x1000 - M.transposeDst() 100 3 2.5656 ms | |
8.586 us 8.531 us 8.777 us | |
455 ns 90 ns 1.014 us | |
6x1000 - M.transposeTilingSO() 100 3 2.9394 ms | |
9.903 us 9.883 us 9.969 us | |
157 ns 8 ns 349 ns | |
6x1000 - M.transposeTiling() 100 2 2.137 ms | |
10.714 us 10.678 us 10.801 us | |
270 ns 132 ns 566 ns | |
6x1000 - M.transposeOptim() 100 3 2.6178 ms | |
8.576 us 8.557 us 8.642 us | |
161 ns 54 ns 362 ns | |
6x2000 - M.transposeSrc() 100 1 2.4143 ms | |
24.308 us 24.179 us 24.712 us | |
1.029 us 245 ns 2.235 us | |
6x2000 - M.transposeDst() 100 2 3.4362 ms | |
16.733 us 16.69 us 16.852 us | |
337 ns 121 ns 714 ns | |
6x2000 - M.transposeTilingSO() 100 2 3.9948 ms | |
19.536 us 19.35 us 20.299 us | |
1.652 us 270 ns 3.856 us | |
6x2000 - M.transposeTiling() 100 1 2.2519 ms | |
22.449 us 22.106 us 23.324 us | |
2.584 us 779 ns 4.813 us | |
6x2000 - M.transposeOptim() 100 2 3.5188 ms | |
17.554 us 17.371 us 18.236 us | |
1.586 us 415 ns 3.629 us | |
640x1000 - M.transposeSrc() 100 1 289.281 ms | |
2.8153 ms 2.78917 ms 2.84514 ms | |
143.091 us 126.887 us 159.577 us | |
640x1000 - M.transposeDst() 100 1 171.014 ms | |
1.60794 ms 1.57746 ms 1.64136 ms | |
162.392 us 146.745 us 194.924 us | |
640x1000 - M.transposeTilingSO() 100 1 152.985 ms | |
1.49469 ms 1.47102 ms 1.5226 ms | |
131.178 us 114.096 us 150.067 us | |
640x1000 - M.transposeTiling() 100 1 149.863 ms | |
1.52566 ms 1.49141 ms 1.56711 ms | |
191.479 us 164.621 us 226.526 us | |
640x1000 - M.transposeOptim() 100 1 148.478 ms | |
1.54903 ms 1.5167 ms 1.58977 ms | |
185.29 us 155.03 us 227.128 us | |
800x640 - M.transposeSrc() 100 1 169.205 ms | |
1.73036 ms 1.70934 ms 1.75401 ms | |
114.835 us 101.702 us 129.548 us | |
800x640 - M.transposeDst() 100 1 162.073 ms | |
1.66443 ms 1.61781 ms 1.71729 ms | |
252.499 us 223.028 us 284.325 us | |
800x640 - M.transposeTilingSO() 100 1 113.913 ms | |
1.19372 ms 1.16122 ms 1.23329 ms | |
181.937 us 155.571 us 220.84 us | |
800x640 - M.transposeTiling() 100 1 109.643 ms | |
1.13409 ms 1.10902 ms 1.16273 ms | |
137.122 us 121 us 155.28 us | |
800x640 - M.transposeOptim() 100 1 114.597 ms | |
1.171 ms 1.14468 ms 1.20163 ms | |
145.198 us 128.907 us 162.895 us | |
640x500 - M.transposeSrc() 100 1 132.556 ms | |
1.34859 ms 1.33202 ms 1.36889 ms | |
93.265 us 78.314 us 119.684 us | |
640x500 - M.transposeDst() 100 1 63.4138 ms | |
646.873 us 629.822 us 670.009 us | |
100.843 us 81.151 us 140.241 us | |
640x500 - M.transposeTilingSO() 100 1 63.1857 ms | |
663.314 us 642.978 us 686.554 us | |
110.759 us 98.833 us 122.304 us | |
640x500 - M.transposeTiling() 100 1 62.2165 ms | |
634.846 us 618.136 us 656.279 us | |
96.002 us 79.693 us 120.969 us | |
640x500 - M.transposeOptim() 100 1 64.32 ms | |
643.628 us 623.666 us 666.202 us | |
107.93 us 95.638 us 120.288 us | |
500x640 - M.transposeSrc() 100 1 78.2872 ms | |
766.578 us 755.671 us 780.743 us | |
62.735 us 51.324 us 81.669 us | |
500x640 - M.transposeDst() 100 1 83.5064 ms | |
796.774 us 783.67 us 813.958 us | |
76.212 us 62.608 us 100.141 us | |
500x640 - M.transposeTilingSO() 100 1 60.4457 ms | |
586.66 us 575.191 us 600.019 us | |
63.125 us 55.79 us 80.027 us | |
500x640 - M.transposeTiling() 100 1 60.3372 ms | |
587.566 us 580.262 us 597.417 us | |
42.86 us 34.976 us 62.777 us | |
500x640 - M.transposeOptim() 100 1 59.5159 ms | |
596.746 us 586.509 us 609.393 us | |
57.613 us 49.225 us 71.918 us | |
640x837 - M.transposeSrc() 100 1 236.482 ms | |
2.34727 ms 2.33144 ms 2.36639 ms | |
88.172 us 74.732 us 105.038 us | |
640x837 - M.transposeDst() 100 1 148.285 ms | |
1.39128 ms 1.35915 ms 1.42674 ms | |
172.051 us 155.703 us 189.89 us | |
640x837 - M.transposeTilingSO() 100 1 130.99 ms | |
1.26723 ms 1.24334 ms 1.29448 ms | |
130.094 us 115.504 us 155.898 us | |
640x837 - M.transposeTiling() 100 1 123.241 ms | |
1.15381 ms 1.13562 ms 1.17656 ms | |
103.489 us 86.369 us 125.177 us | |
640x837 - M.transposeOptim() 100 1 131.093 ms | |
1.2513 ms 1.22887 ms 1.27694 ms | |
122.607 us 108.589 us 138.794 us | |
837x640 - M.transposeSrc() 100 1 141.651 ms | |
1.36352 ms 1.34847 ms 1.38188 ms | |
84.671 us 72.238 us 99.24 us | |
837x640 - M.transposeDst() 100 1 175.583 ms | |
1.65168 ms 1.6168 ms 1.69022 ms | |
187.367 us 167.584 us 207.404 us | |
837x640 - M.transposeTilingSO() 100 1 119.761 ms | |
1.14924 ms 1.12977 ms 1.1717 ms | |
106.828 us 94.149 us 121.626 us | |
837x640 - M.transposeTiling() 100 1 111.963 ms | |
1.06767 ms 1.05048 ms 1.08961 ms | |
98.186 us 81.284 us 122.473 us | |
837x640 - M.transposeOptim() 100 1 111.887 ms | |
1.06025 ms 1.04424 ms 1.07948 ms | |
89.609 us 77.382 us 103.938 us | |
=============================================================================== | |
All tests passed (26448914 assertions in 1 test case) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment