Skip to content

Instantly share code, notes, and snippets.


The trick to designing transpose algorithms for both small and large problems is to recognize their simple recursive structure.

For a matrix A, let's denote its transpose by T(A) as a shorthand. First, suppose A is a 2x2 matrix:

    [A00 A01]
A = [A10 A11]

Then we have:

def bits(x, start, stop):
assert 0 <= start <= stop
return (x >> start) & (1 << (stop - start) - 1)
# Instruction set definition
sse1 = make_instruction_set("SSE1")
m128 = sse1.make_vector_type("m128", float32, 4)
instruction = sse1.instruction
View abstract_matrix.cpp
struct AbstractMatrix {
int m; // number of rows
int n; // number of columns
// Pack block at ib, jb of size mb, nb into dest in row-major format.
virtual void pack_rowmajor(int ib, int jb, int mb, int nb, float *dest) const = 0;
// Unpack row-major matrix from src into block at ib, jb of size mb, nb.
virtual void unpack_rowmajor(int ib, int jb, int mb, int nb, const float *src) = 0;
View transpose.cpp
transpose_ij (10000x5000): gmemops=0.23, min=0.436734, avg=0.455368, relerr=3.42%
transpose_ji (10000x5000): gmemops=0.28, min=0.356635, avg=0.363628, relerr=2.55%
transpose_ij_ij (10000x5000): gmemops=1.53, min=0.065207, avg=0.069465, relerr=6.07%
transpose_rec1 (10000x5000): gmemops=1.44, min=0.069258, avg=0.075378, relerr=8.96%
transpose_rec2 (10000x5000): gmemops=1.37, min=0.072819, avg=0.079644, relerr=8.77%
transpose_ij (100000x100): gmemops=1.70, min=0.011731, avg=0.014102, relerr=9.99%
transpose_ji (100000x100): gmemops=0.23, min=0.086909, avg=0.095706, relerr=3.37%
transpose_ij_ij (100000x100): gmemops=1.73, min=0.011543, avg=0.013170, relerr=6.96%
View msvc_sucks_shit.cpp
struct Matrix {
float *base; // pointer to first element
int m; // number of rows
int n; // number of columns
int drow; // row stride
Matrix(float *base, int m, int n, int drow = 0) : base(base), m(m), n(n), drow(drow ? drow : n) { }
INLINE float& operator()(int i, int j) {
// assert(0 <= i && i < m);
View benchmark.cpp
#if _WIN32
struct Timer {
LARGE_INTEGER win32_freq;
LARGE_INTEGER win32_start;
Timer() {
win32_start.QuadPart = 0;
View timing.cpp
#if _WIN32
// measure the time in seconds to run f(). doubles will retain nanosecond precision.
template<typename F>
double timeit(F f) {
View tail_recursion_to_iteration.c
// Version 1: Recursion.
node_t *find1(node_t *node, int key) {
if (!node) {
return NULL;
if (key < node->key) {
return find1(node->left, key);
} else if (key == node->key) {
return node;
def blocks(A, block_size=(1, 1)):
i, j = block_size
while len(A):
yield A[:i, :j], A[:i, j:], A[i:, :j], A[i:, j:]
A = A[i:, j:]
# 2400 ms for 1000x1000 matrix (~0.4 GFLOPS/sec). Equivalent C code is only twice as fast (~0.8 GFLOPS/sec).
# The reason the C code isn't much faster is that it's an O(n^3) algorithm and most of the time is spent in
# the O(n^2) kernel routine for the outer product in A11[:] -= np.outer(A10, A10). Even if we posit that Python
# is 1000x slower than C for the outer loop, that's still 1000n + n^3 vs n^3, which is negligible for n = 1000.
def givens(x, y):
r = np.hypot(x, y)
if np.isclose(r, 0):
return x, 1, 0
return r, x / r, y / r
def cholesky_update(L, x):
m, n = A.shape
x = x.copy()
for i in range(m):
You can’t perform that action at this time.