Skip to content

Instantly share code, notes, and snippets.

@bjourne
Created September 11, 2022 12:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjourne/860b8b509c85bedd9bf2eea418668b4d to your computer and use it in GitHub Desktop.
Save bjourne/860b8b509c85bedd9bf2eea418668b4d to your computer and use it in GitHub Desktop.
// Notes:
//
// * unsigned int vs. int: makes a small difference for clang but
// probably not for gcc.
// * best tiling appears to be 256x256x256.
//
// 12.31 for two 8192 matrices
#include <assert.h>
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <xmmintrin.h>
#ifndef SIZE
#define SIZE 256
#endif
#ifndef TILE_I
#define TILE_I 32
#endif
#ifndef TILE_J
#define TILE_J 32
#endif
#ifndef TILE_K
#define TILE_K 32
#endif
#ifndef N_THREADS
#define N_THREADS 12
#endif
#define SIMD_HEIGHT 2
#define SIMD_WIDTH 16
#define N_BYTES (SIZE*SIZE*sizeof(float))
#define MIN(a, b) ((a > b) ? (b) : (a))
typedef unsigned int uint_t;
void
mul_slow(float *A, float *B, float *C) {
for (uint_t i = 0; i < SIZE; i++) {
for (uint_t j = 0; j < SIZE; j++) {
float v = 0;
for (uint_t k = 0; k < SIZE; k++) {
v += A[SIZE * i + k] * B[SIZE * k + j];
}
C[SIZE * i + j] = v;
}
}
}
void
print_mat(float *M, int size) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
printf("%3.0f ", M[i * size + j]);
}
printf("\n");
}
printf("\n");
}
void
mul_slow_tile(
int i0, int i1,
int j0, int j1,
int k0, int k1,
float * restrict A,
float * restrict B,
float * restrict C
) {
for (int i = i0; i < i1; i ++) {
for (int k = k0; k < k1; k++) {
float a = A[SIZE * i + k];
for (int j = j0; j < j1; j++) {
float b = B[SIZE * k + j];
C[SIZE * i + j] += a * b;
}
}
}
}
static void
mul_fast_tile_16x2(
uint_t i0, uint_t i1,
uint_t j0, uint_t j1,
uint_t k0, uint_t k1,
float * restrict Apt,
float * restrict Bpt,
float * restrict C
) {
for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) {
float * restrict Bptr = Bpt;
float * restrict Cptr0 = &C[SIZE * (i + 0) + j0];
float * restrict Cptr1 = &C[SIZE * (i + 1) + j0];
for (uint_t j = j0; j < j1; j += SIMD_WIDTH) {
__m128 acc00 = _mm_load_ps(Cptr0 + 0);
__m128 acc01 = _mm_load_ps(Cptr0 + 4);
__m128 acc02 = _mm_load_ps(Cptr0 + 8);
__m128 acc03 = _mm_load_ps(Cptr0 + 12);
__m128 acc10 = _mm_load_ps(Cptr1 + 0);
__m128 acc11 = _mm_load_ps(Cptr1 + 4);
__m128 acc12 = _mm_load_ps(Cptr1 + 8);
__m128 acc13 = _mm_load_ps(Cptr1 + 12);
float * restrict Aptr = &Apt[SIZE * i + 2 * k0];
for (uint_t k = k0; k < k1; k++) {
__m128 a0 = _mm_set1_ps(*Aptr++);
__m128 a1 = _mm_set1_ps(*Aptr++);
__m128 b0 = _mm_load_ps(Bptr + 0);
//Bptr += 4;
__m128 b1 = _mm_load_ps(Bptr + 4);
//Bptr += 4;
__m128 b2 = _mm_load_ps(Bptr + 8);
//Bptr += 4;
__m128 b3 = _mm_load_ps(Bptr + 12);
Bptr += 16;
acc00 = _mm_add_ps(acc00, _mm_mul_ps(a0, b0));
acc01 = _mm_add_ps(acc01, _mm_mul_ps(a0, b1));
acc02 = _mm_add_ps(acc02, _mm_mul_ps(a0, b2));
acc03 = _mm_add_ps(acc03, _mm_mul_ps(a0, b3));
acc10 = _mm_add_ps(acc10, _mm_mul_ps(a1, b0));
acc11 = _mm_add_ps(acc11, _mm_mul_ps(a1, b1));
acc12 = _mm_add_ps(acc12, _mm_mul_ps(a1, b2));
acc13 = _mm_add_ps(acc13, _mm_mul_ps(a1, b3));
}
_mm_store_ps(Cptr0 + 0, acc00);
_mm_store_ps(Cptr0 + 4, acc01);
_mm_store_ps(Cptr0 + 8, acc02);
_mm_store_ps(Cptr0 + 12, acc03);
_mm_store_ps(Cptr1 + 0, acc10);
_mm_store_ps(Cptr1 + 4, acc11);
_mm_store_ps(Cptr1 + 8, acc12);
_mm_store_ps(Cptr1 + 12, acc13);
Cptr0 += 16;
Cptr1 += 16;
}
}
}
/* static void */
/* mul_fast_tile_nd( */
/* uint_t i0, uint_t i1, */
/* uint_t j0, uint_t j1, */
/* uint_t k0, uint_t k1, */
/* float * restrict A, */
/* float * restrict Bpt, */
/* float * restrict C */
/* ) { */
/* for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) { */
/* float *Bptr = Bpt; */
/* for (uint_t j = j0; j < j1; j += SIMD_WIDTH) { */
/* __m128 acc[SIMD_HEIGHT][SIMD_WIDTH / 4]; */
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/* acc[y][x] = _mm_load_ps(&C[SIZE * (i + y) + j + 4*x]); */
/* } */
/* } */
/* for (uint_t k = k0; k < k1; k++) { */
/* __m128 a[SIMD_HEIGHT]; */
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/* a[y] = _mm_set1_ps(A[SIZE * (i + y) + k]); */
/* } */
/* __m128 b[SIMD_WIDTH / 4]; */
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/* b[x] = _mm_load_ps(Bptr); */
/* Bptr += 4; */
/* } */
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/* acc[y][x] = _mm_add_ps(acc[y][x], */
/* _mm_mul_ps(a[y], b[x])); */
/* } */
/* } */
/* } */
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/* _mm_store_ps(&C[SIZE * (i + y) + j + 4*x], */
/* acc[y][x]); */
/* } */
/* } */
/* } */
/* } */
/* } */
typedef struct {
float *A, *Bp, *C;
uint_t start_i, end_i;
} mul_job_t;
static void *
mul_thread(void *arg) {
mul_job_t *job = (mul_job_t *)arg;
float *A = job->A;
float *Bp = job->Bp;
float *C = job->C;
uint_t start_i = job->start_i;
uint_t end_i = job->end_i;
for (uint_t i = start_i; i < end_i; i += TILE_I) {
uint_t imax = MIN(i + TILE_I, SIZE);
for (uint_t j = 0; j < SIZE; j += TILE_J) {
uint_t jmax = MIN(j + TILE_J, SIZE);
for (uint_t k = 0; k < SIZE; k += TILE_K) {
uint_t kmax = MIN(k + TILE_K, SIZE);
float *Bptr = &Bp[k * SIZE + j * TILE_K];
mul_fast_tile_16x2(i, imax, j, jmax, k, kmax,
A, Bptr, C);
}
}
}
return 0;
}
static float *Abuf = NULL;
static float *Bbuf = NULL;
static void
mul_fast(float * restrict A,
float * restrict B,
float * restrict C) {
assert(SIZE % TILE_I == 0);
assert(SIZE % TILE_J == 0);
assert(SIZE % TILE_K == 0);
assert(TILE_I % SIMD_HEIGHT == 0);
assert(TILE_J % SIMD_WIDTH == 0);
assert(Bbuf);
float *Bptr = Bbuf;
for (uint_t k = 0; k < SIZE; k += TILE_K) {
for (uint_t j = 0; j < SIZE; j += TILE_J) {
for (uint_t y = 0; y < TILE_J; y += SIMD_WIDTH) {
for (uint_t x = 0; x < TILE_K; x++) {
uint_t src = SIZE * (k + x) + (j + y);
for (uint_t o = 0; o < SIMD_WIDTH; o++) {
*Bptr++ = B[src + o];
}
}
}
}
}
float *Aptr = Abuf;
for (int i = 0; i < SIZE; i += SIMD_HEIGHT) {
for (int k = 0; k < SIZE; k++) {
*Aptr++ = A[SIZE * (i + 0) + k];
*Aptr++ = A[SIZE * (i + 1) + k];
}
}
pthread_t threads[N_THREADS];
mul_job_t jobs[N_THREADS];
int n_i_tiles = (int)ceil((float)SIZE / (float)TILE_I);
int tiles_per_thread = (int)ceil((float)n_i_tiles / (float)N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
int start = TILE_I * i * tiles_per_thread;
int end = MIN(TILE_I * (i + 1) * tiles_per_thread, SIZE);
jobs[i] = (mul_job_t){Abuf, Bbuf, C, start, end};
pthread_create(&threads[i], NULL, mul_thread, &jobs[i]);
}
for (int i = 0; i < N_THREADS; i++) {
pthread_join(threads[i], NULL);
}
}
int
main(int argc, char *argv[]) {
float *A = malloc(N_BYTES);
float *B = malloc(N_BYTES);
float *C = calloc(N_BYTES, 1);
float *C2 = calloc(N_BYTES, 1);
Abuf = malloc(N_BYTES);
Bbuf = malloc(N_BYTES);
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
A[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
B[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
}
}
//mul_slow((float *)A, (float *)B, (float *)C2);
struct timespec begin, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
mul_fast((float *)A, (float *)B, (float *)C);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
double delta = (end.tv_nsec - begin.tv_nsec) / 1000000000.0 +
(end.tv_sec - begin.tv_sec);
float gflops = (long)SIZE * (long)SIZE * (long)SIZE
/ (delta * 1000.0 * 1000.0 * 1000.0);
//printf("%.6lfs, %.2f gflops\n", delta, gflops);
printf("%5d %8d %6d %6d %6d %6.2f %7.2f\n",
SIZE, N_THREADS, TILE_I, TILE_J, TILE_K, delta, gflops);
/* for (int i = 0; i < SIZE; i++) { */
/* for (int j = 0; j < SIZE; j++) { */
/* float v = C[SIZE * i + j]; */
/* float v2 = C2[SIZE * i + j]; */
/* float diff = fabs(v - v2); */
/* if (diff > 0.1) { */
/* printf("%6.2f %6.2f\n", v, v2); */
/* assert(false); */
/* } */
/* } */
/* } */
free(A);
free(B);
free(C);
free(C2);
free(Abuf);
free(Bbuf);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment