Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Last active February 22, 2023 22:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pervognsen/e9ebbb5220bf3a9b688854dc5ec12dbf to your computer and use it in GitHub Desktop.
Save pervognsen/e9ebbb5220bf3a9b688854dc5ec12dbf to your computer and use it in GitHub Desktop.
struct Matrix {
float *base; // pointer to first element
int m; // number of rows
int n; // number of columns
int drow; // row stride
Matrix(float *base, int m, int n, int drow = 0) : base(base), m(m), n(n), drow(drow ? drow : n) { }
INLINE float& operator()(int i, int j) {
// assert(0 <= i && i < m);
// assert(0 <= j && j < n);
return base[i*drow + j];
}
// block starting at i, j with clamped size mb, nb
Matrix block(int i, int j, int mb, int nb) {
return {base + i*drow + j, clamp(mb, 0, m - i), clamp(nb, 0, n - j), drow};
}
};
void gemm_ijk(Matrix C, Matrix A, Matrix B) {
int m = C.m;
int n = C.n;
int p = A.n;
assert(A.m == m);
assert(B.n == n);
assert(B.m == p);
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < p; k++) {
// MSVC 2017 generates 3 IMULs + ADDs per inner loop. That's 3 billion IMULs + 3 billion ADDs for 1000x1000.
// ICC recognizes and reduces the induction variables, so there's only two ADDs per inner loop.
C(i, j) += A(i, k) * B(k, j);
}
}
}
}
// For 1000x1000, this is 2x faster than gemm_ijk on MSVC and the same speed as gemm_ijk on ICC.
void gemm_ijk_ptr(Matrix C, Matrix A, Matrix B) {
int m = C.m;
int n = C.n;
int p = A.n;
assert(A.m == m);
assert(B.n == n);
assert(B.m == p);
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
float *Cptr = &C(i, j);
float *Aptr = &A(i, 0);
float *Bptr = &B(0, j);
for (int k = 0; k < p; k++) {
// C(i, j) += A(i, k) * B(k, j);
*Cptr += *Aptr * *Bptr;
Aptr++;
Bptr += B.drow;
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment