View transpose.md

The trick to designing transpose algorithms for both small and large problems is to recognize their simple recursive structure.

For a matrix A, let's denote its transpose by T(A) as a shorthand. First, suppose A is a 2x2 matrix:

    [A00 A01]
A = [A10 A11]

Then we have:

       [A00 A10]
T(A) = [A01 A11]

The same picture holds when the entries are themselves matrices (blocks): the off-diagonal blocks trade places and every block is transposed recursively.
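That recursion maps directly onto code: split into quadrants, recurse on each, and let the index flip handle the off-diagonal swap. A cache-oblivious sketch in C++ (the function and parameter names here are mine, not necessarily what the gist's transpose_rec1/rec2 below use):

// Transpose the mb x nb block of src at (ib, jb), where src is m x n and
// dst is n x m, both row-major. Call as transpose_rec(A, B, n, m, 0, 0, m, n)
// to transpose all of A into B. Recursion stops at a cache-friendly block.
void transpose_rec(const float *src, float *dst, int n, int m,
                   int ib, int jb, int mb, int nb) {
    if (mb <= 16 && nb <= 16) {
        for (int i = ib; i < ib + mb; i++)
            for (int j = jb; j < jb + nb; j++)
                dst[j * m + i] = src[i * n + j];
        return;
    }
    // Split the longer dimension in half; the (i, j) -> (j, i) flip in the
    // base case is what swaps the off-diagonal quadrants.
    if (mb >= nb) {
        int h = mb / 2;
        transpose_rec(src, dst, n, m, ib, jb, h, nb);
        transpose_rec(src, dst, n, m, ib + h, jb, mb - h, nb);
    } else {
        int h = nb / 2;
        transpose_rec(src, dst, n, m, ib, jb, mb, h);
        transpose_rec(src, dst, n, m, ib, jb + h, mb, nb - h);
    }
}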
View sse1.py
def bits(x, start, stop):
    # Extract the bit field x[start:stop), stop exclusive.
    assert 0 <= start <= stop
    return (x >> start) & ((1 << (stop - start)) - 1)

# Instruction set definition
sse1 = make_instruction_set("SSE1")
m128 = sse1.make_vector_type("m128", float32, 4)
instruction = sse1.instruction
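A quick sanity check of bits (the example values are mine, not from the gist):

x = 0b1101_0110
assert bits(x, 1, 4) == 0b011   # bits 1..3 of x
assert bits(x, 4, 8) == 0b1101  # bits 4..7 of x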
View abstract_matrix.cpp
struct AbstractMatrix {
    int m; // number of rows
    int n; // number of columns
    // Pack block at ib, jb of size mb, nb into dest in row-major format.
    virtual void pack_rowmajor(int ib, int jb, int mb, int nb, float *dest) const = 0;
    // Unpack row-major matrix from src into block at ib, jb of size mb, nb.
    virtual void unpack_rowmajor(int ib, int jb, int mb, int nb, const float *src) = 0;
};
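A concrete subclass for a dense row-major matrix might look like this (a sketch; DenseMatrix and its data field are illustrative names, not from the gist):

// Dense row-major storage: element (i, j) lives at data[i * n + j].
struct DenseMatrix : AbstractMatrix {
    float *data;

    void pack_rowmajor(int ib, int jb, int mb, int nb, float *dest) const override {
        for (int i = 0; i < mb; i++)
            for (int j = 0; j < nb; j++)
                *dest++ = data[(ib + i) * n + (jb + j)];
    }
    void unpack_rowmajor(int ib, int jb, int mb, int nb, const float *src) override {
        for (int i = 0; i < mb; i++)
            for (int j = 0; j < nb; j++)
                data[(ib + i) * n + (jb + j)] = *src++;
    }
};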
View transpose.cpp
/*
transpose_ij (10000x5000): gmemops=0.23, min=0.436734, avg=0.455368, relerr=3.42%
transpose_ji (10000x5000): gmemops=0.28, min=0.356635, avg=0.363628, relerr=2.55%
transpose_ij_ij (10000x5000): gmemops=1.53, min=0.065207, avg=0.069465, relerr=6.07%
transpose_rec1 (10000x5000): gmemops=1.44, min=0.069258, avg=0.075378, relerr=8.96%
transpose_rec2 (10000x5000): gmemops=1.37, min=0.072819, avg=0.079644, relerr=8.77%
transpose_ij (100000x100): gmemops=1.70, min=0.011731, avg=0.014102, relerr=9.99%
transpose_ji (100000x100): gmemops=0.23, min=0.086909, avg=0.095706, relerr=3.37%
transpose_ij_ij (100000x100): gmemops=1.73, min=0.011543, avg=0.013170, relerr=6.96%
View msvc_sucks_shit.cpp
struct Matrix {
    float *base; // pointer to first element
    int m;       // number of rows
    int n;       // number of columns
    int drow;    // row stride
    Matrix(float *base, int m, int n, int drow = 0) : base(base), m(m), n(n), drow(drow ? drow : n) { }
    INLINE float& operator()(int i, int j) {
        // assert(0 <= i && i < m);
        return base[i * drow + j];
    }
};
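The explicit row stride is what makes zero-copy submatrix views work; an illustration (my own example, assuming the operator() above):

float buf[6 * 8] = {};
Matrix A(buf, 6, 8);          // 6x8 matrix, drow defaults to n = 8
Matrix B(&A(2, 3), 4, 5, 8);  // 4x5 submatrix rooted at (2, 3), same storage
B(0, 0) = 1.0f;               // writes A(2, 3)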
View benchmark.cpp
#if _WIN32
#include <windows.h>

// Timer based on the Win32 high-resolution performance counter.
struct Timer {
    LARGE_INTEGER win32_freq;
    LARGE_INTEGER win32_start;
    Timer() {
        QueryPerformanceFrequency(&win32_freq);
        win32_start.QuadPart = 0;
    }
View timing.cpp
#if _WIN32
#include <windows.h>

// measure the time in seconds to run f(). doubles will retain nanosecond precision.
template<typename F>
double timeit(F f) {
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    LARGE_INTEGER start;
    QueryPerformanceCounter(&start);
    f();
    LARGE_INTEGER end;
    QueryPerformanceCounter(&end);
    return (double)(end.QuadPart - start.QuadPart) / freq.QuadPart;
}
#endif
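Typical usage wraps the code under test in a lambda; the transpose call inside is illustrative, timeit only needs a callable:

// Time one run; in practice take the min over several runs to reduce noise.
double secs = timeit([&] { transpose_ij(A, B); });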
View tail_recursion_to_iteration.c
// Version 1: Recursion.
node_t *find1(node_t *node, int key) {
    if (!node) {
        return NULL;
    }
    if (key < node->key) {
        return find1(node->left, key);
    } else if (key == node->key) {
        return node;
    } else {
        return find1(node->right, key);
    }
}
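Given the file name, the companion version presumably converts the tail calls into a loop; a sketch of that standard transformation:

// Version 2: Iteration. Each tail call find1(node->left/right, key) becomes
// a reassignment of the parameter, and the function entry becomes a loop head.
node_t *find2(node_t *node, int key) {
    while (node) {
        if (key < node->key) {
            node = node->left;
        } else if (key == node->key) {
            return node;
        } else {
            node = node->right;
        }
    }
    return NULL;
}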
View flame_cholesky.py
import numpy as np

def blocks(A, block_size=(1, 1)):
    # Repeatedly split off the leading i x j block, yielding the four
    # quadrants at each step. Slices are views, so callers can update A
    # in place through them.
    i, j = block_size
    while len(A):
        yield A[:i, :j], A[:i, j:], A[i:, :j], A[i:, j:]
        A = A[i:, j:]

# 2400 ms for a 1000x1000 matrix (~0.4 GFLOPS/sec). Equivalent C code is only twice as fast (~0.8 GFLOPS/sec).
# The reason the C code isn't much faster is that it's an O(n^3) algorithm and most of the time is spent in
# the O(n^2) kernel routine for the outer product in A11[:] -= np.outer(A10, A10). Even if we posit that Python
# is 1000x slower than C for the outer loop, that's still 1000n + n^3 vs n^3, which is negligible for n = 1000.
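For context, the update step the comment refers to sits in a driver loop like this (a minimal unblocked sketch reconstructed from that comment, not copied from the gist):

def cholesky(A):
    # Right-looking Cholesky over the generator: A00 is the 1x1 pivot,
    # A10 the column below it, A11 the trailing Schur complement.
    for A00, A01, A10, A11 in blocks(A):
        A00[:] = np.sqrt(A00)
        A10[:] /= A00
        A11[:] -= np.outer(A10, A10)
    return np.tril(A)  # the factor L lives in the lower triangle

With block_size=(1, 1) this is the classic unblocked algorithm; larger blocks would additionally need a triangular solve for A10 and a recursive factorization of A00, turning the rank-1 update into a GEMM-shaped one.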
View cholesky_update.py
import numpy as np

def givens(x, y):
    # Return (r, c, s) with c*x + s*y = r = hypot(x, y) and -s*x + c*y = 0.
    r = np.hypot(x, y)
    if np.isclose(r, 0):
        return x, 1, 0
    return r, x / r, y / r

def cholesky_update(L, x):
    m, n = L.shape
    x = x.copy()
    for i in range(m):
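        # (Reconstructed continuation: the preview ends at the loop head.
        # This is the textbook Givens-based rank-1 update, giving an L'
        # with L' @ L'.T == L @ L.T + np.outer(x, x); it is not necessarily
        # the gist's exact code. Each rotation zeroes x[i] against L[i, i].)
        r, c, s = givens(L[i, i], x[i])
        L[i, i] = r
        L[i+1:, i], x[i+1:] = (c * L[i+1:, i] + s * x[i+1:],
                               -s * L[i+1:, i] + c * x[i+1:])
    return L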