bjourne/mul4.c

## mul4.c
// Notes:
//
// * unsigned int vs. int: makes a small difference for clang but
//   probably not for gcc.
// * best tiling appears to be 256x256x256.
//
// 12.31 for two 8192 matrices
#include <assert.h>
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <xmmintrin.h>

#ifndef SIZE
#define SIZE            256
#endif

#ifndef TILE_I
#define TILE_I          32
#endif

#ifndef TILE_J
#define TILE_J          32
#endif

#ifndef TILE_K
#define TILE_K          32
#endif

#ifndef N_THREADS
#define N_THREADS       12
#endif

#define SIMD_HEIGHT     2
#define SIMD_WIDTH      16
#define N_BYTES         (SIZE*SIZE*sizeof(float))
#define MIN(a, b)       ((a > b) ? (b) : (a))

typedef unsigned int uint_t;

void
mul_slow(float *A, float *B, float *C) {
    for (uint_t i = 0; i < SIZE; i++) {
        for (uint_t j = 0; j < SIZE; j++) {
            float v = 0;
            for (uint_t k = 0; k < SIZE; k++) {
                v += A[SIZE * i + k] * B[SIZE * k + j];
            }
            C[SIZE * i + j] = v;
        }
    }
}

void
print_mat(float *M, int size) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            printf("%3.0f ", M[i * size + j]);
        }
        printf("\n");
    }
    printf("\n");
}

void
mul_slow_tile(
    int i0, int i1,
    int j0, int j1,
    int k0, int k1,
    float * restrict A,
    float * restrict B,
    float * restrict C
) {
    for (int i = i0; i < i1; i ++) {
        for (int k = k0; k < k1; k++) {
            float a = A[SIZE * i + k];
            for (int j = j0; j < j1; j++) {
                float b = B[SIZE * k + j];
                C[SIZE * i + j] += a * b;
            }
        }
    }
}

static void
mul_fast_tile_16x2(
    uint_t i0, uint_t i1,
    uint_t j0, uint_t j1,
    uint_t k0, uint_t k1,
    float * restrict Apt,
    float * restrict Bpt,
    float * restrict C
) {
    for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) {
        float * restrict Bptr = Bpt;
        float * restrict Cptr0 = &C[SIZE * (i + 0) + j0];
        float * restrict Cptr1 = &C[SIZE * (i + 1) + j0];
        for (uint_t j = j0; j < j1; j += SIMD_WIDTH) {
            __m128 acc00 = _mm_load_ps(Cptr0 + 0);
            __m128 acc01 = _mm_load_ps(Cptr0 + 4);
            __m128 acc02 = _mm_load_ps(Cptr0 + 8);
            __m128 acc03 = _mm_load_ps(Cptr0 + 12);

            __m128 acc10 = _mm_load_ps(Cptr1 + 0);
            __m128 acc11 = _mm_load_ps(Cptr1 + 4);
            __m128 acc12 = _mm_load_ps(Cptr1 + 8);
            __m128 acc13 = _mm_load_ps(Cptr1 + 12);

            float * restrict Aptr = &Apt[SIZE * i + 2 * k0];
            for (uint_t k = k0; k < k1; k++) {
                __m128 a0 = _mm_set1_ps(*Aptr++);
                __m128 a1 = _mm_set1_ps(*Aptr++);
                __m128 b0 = _mm_load_ps(Bptr + 0);
                //Bptr += 4;
                __m128 b1 = _mm_load_ps(Bptr + 4);
                //Bptr += 4;
                __m128 b2 = _mm_load_ps(Bptr + 8);
                //Bptr += 4;
                __m128 b3 = _mm_load_ps(Bptr + 12);
                Bptr += 16;
                acc00 = _mm_add_ps(acc00, _mm_mul_ps(a0,  b0));
                acc01 = _mm_add_ps(acc01, _mm_mul_ps(a0,  b1));
                acc02 = _mm_add_ps(acc02, _mm_mul_ps(a0,  b2));
                acc03 = _mm_add_ps(acc03, _mm_mul_ps(a0,  b3));

                acc10 = _mm_add_ps(acc10, _mm_mul_ps(a1,  b0));
                acc11 = _mm_add_ps(acc11, _mm_mul_ps(a1,  b1));
                acc12 = _mm_add_ps(acc12, _mm_mul_ps(a1,  b2));
                acc13 = _mm_add_ps(acc13, _mm_mul_ps(a1,  b3));
            }
            _mm_store_ps(Cptr0 +  0, acc00);
            _mm_store_ps(Cptr0 +  4, acc01);
            _mm_store_ps(Cptr0 +  8, acc02);
            _mm_store_ps(Cptr0 + 12, acc03);

            _mm_store_ps(Cptr1 +  0, acc10);
            _mm_store_ps(Cptr1 +  4, acc11);
            _mm_store_ps(Cptr1 +  8, acc12);
            _mm_store_ps(Cptr1 + 12, acc13);

            Cptr0 += 16;
            Cptr1 += 16;
        }
    }
}

/* static void */
/* mul_fast_tile_nd( */
/*     uint_t i0, uint_t i1, */
/*     uint_t j0, uint_t j1, */
/*     uint_t k0, uint_t k1, */
/*     float * restrict A, */
/*     float * restrict Bpt, */
/*     float * restrict C */
/* ) { */
/*     for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) { */
/*         float *Bptr = Bpt; */
/*         for (uint_t j = j0; j < j1; j += SIMD_WIDTH) { */
/*             __m128 acc[SIMD_HEIGHT][SIMD_WIDTH / 4]; */
/*             for (uint_t y = 0; y < SIMD_HEIGHT; y++)  { */
/*                 for (uint_t x = 0; x < SIMD_WIDTH/4; x++)  { */
/*                     acc[y][x] = _mm_load_ps(&C[SIZE * (i + y) + j + 4*x]); */
/*                 } */
/*             } */
/*             for (uint_t k = k0; k < k1; k++) { */
/*                 __m128 a[SIMD_HEIGHT]; */
/*                 for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/*                     a[y] = _mm_set1_ps(A[SIZE * (i + y) + k]); */
/*                 } */
/*                 __m128 b[SIMD_WIDTH / 4]; */
/*                 for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/*                     b[x] = _mm_load_ps(Bptr); */
/*                     Bptr += 4; */
/*                 } */
/*                 for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/*                     for (uint_t x = 0; x < SIMD_WIDTH/4; x++)  { */
/*                         acc[y][x] = _mm_add_ps(acc[y][x], */
/*                                                _mm_mul_ps(a[y],  b[x])); */
/*                     } */
/*                 } */
/*             } */
/*             for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
/*                 for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
/*                     _mm_store_ps(&C[SIZE * (i + y) + j +  4*x], */
/*                                  acc[y][x]); */
/*                 } */
/*             } */
/*         } */
/*     } */
/* } */

typedef struct {
    float *A, *Bp, *C;
    uint_t start_i, end_i;
} mul_job_t;

static void *
mul_thread(void *arg) {
    mul_job_t *job = (mul_job_t *)arg;
    float *A = job->A;
    float *Bp = job->Bp;
    float *C = job->C;
    uint_t start_i = job->start_i;
    uint_t end_i = job->end_i;
    for (uint_t i = start_i; i < end_i; i += TILE_I) {
        uint_t imax = MIN(i + TILE_I, SIZE);
        for (uint_t j = 0; j < SIZE; j += TILE_J) {
            uint_t jmax = MIN(j + TILE_J, SIZE);
            for (uint_t k = 0; k < SIZE; k += TILE_K) {
                uint_t kmax = MIN(k + TILE_K, SIZE);
                float *Bptr = &Bp[k * SIZE + j * TILE_K];
                mul_fast_tile_16x2(i, imax, j, jmax, k, kmax,
                                   A, Bptr, C);
            }
        }
    }
    return 0;
}

static float *Abuf = NULL;
static float *Bbuf = NULL;

static void
mul_fast(float * restrict A,
         float * restrict B,
         float * restrict C) {

    assert(SIZE % TILE_I == 0);
    assert(SIZE % TILE_J == 0);
    assert(SIZE % TILE_K == 0);
    assert(TILE_I % SIMD_HEIGHT == 0);
    assert(TILE_J % SIMD_WIDTH == 0);
    assert(Bbuf);

    float *Bptr = Bbuf;
    for (uint_t k = 0; k < SIZE; k += TILE_K) {
        for (uint_t j = 0; j < SIZE; j += TILE_J) {
            for (uint_t y = 0; y < TILE_J; y += SIMD_WIDTH)  {
                for (uint_t x = 0; x < TILE_K; x++) {
                    uint_t src = SIZE * (k + x) + (j + y);
                    for (uint_t o = 0; o < SIMD_WIDTH; o++)  {
                        *Bptr++ = B[src + o];
                    }
                }
            }
        }
    }

    float *Aptr = Abuf;
    for (int i = 0; i < SIZE; i += SIMD_HEIGHT) {
        for (int k = 0; k < SIZE; k++) {
            *Aptr++ = A[SIZE * (i + 0) + k];
            *Aptr++ = A[SIZE * (i + 1) + k];
        }
    }

    pthread_t threads[N_THREADS];
    mul_job_t jobs[N_THREADS];
    int n_i_tiles = (int)ceil((float)SIZE / (float)TILE_I);
    int tiles_per_thread = (int)ceil((float)n_i_tiles / (float)N_THREADS);
    for (int i = 0; i < N_THREADS; i++) {
        int start = TILE_I * i * tiles_per_thread;
        int end = MIN(TILE_I * (i + 1) * tiles_per_thread, SIZE);
        jobs[i] = (mul_job_t){Abuf, Bbuf, C, start, end};
        pthread_create(&threads[i], NULL, mul_thread, &jobs[i]);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
}

int
main(int argc, char *argv[]) {


    float *A = malloc(N_BYTES);
    float *B = malloc(N_BYTES);
    float *C = calloc(N_BYTES, 1);
    float *C2 = calloc(N_BYTES, 1);

    Abuf = malloc(N_BYTES);
    Bbuf = malloc(N_BYTES);

    for  (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            A[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
            B[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
        }
    }
    //mul_slow((float *)A, (float *)B, (float *)C2);
    struct timespec begin, end;
    clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
    mul_fast((float *)A, (float *)B, (float *)C);
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
    double delta = (end.tv_nsec - begin.tv_nsec) / 1000000000.0 +
        (end.tv_sec  - begin.tv_sec);
    float gflops = (long)SIZE * (long)SIZE * (long)SIZE
        / (delta * 1000.0 * 1000.0 * 1000.0);
    //printf("%.6lfs, %.2f gflops\n", delta, gflops);

    printf("%5d %8d %6d %6d %6d %6.2f %7.2f\n",
           SIZE, N_THREADS, TILE_I, TILE_J, TILE_K, delta, gflops);

    /* for (int i = 0; i < SIZE; i++) { */
    /*     for (int j = 0; j < SIZE; j++) { */
    /*         float v = C[SIZE * i + j]; */
    /*         float v2 = C2[SIZE * i + j]; */
    /*         float diff = fabs(v - v2); */
    /*         if (diff > 0.1) { */
    /*             printf("%6.2f %6.2f\n", v, v2); */
    /*             assert(false); */
    /*         } */
    /*     } */
    /* } */
    free(A);
    free(B);
    free(C);
    free(C2);
    free(Abuf);
    free(Bbuf);

    return 0;
}
	// Notes:
	//
	// * unsigned int vs. int: makes a small difference for clang but
	// probably not for gcc.
	// * best tiling appears to be 256x256x256.
	//
	// 12.31 for two 8192 matrices
	#include <assert.h>
	#include <math.h>
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <time.h>
	#include <xmmintrin.h>

	#ifndef SIZE
	#define SIZE 256
	#endif

	#ifndef TILE_I
	#define TILE_I 32
	#endif

	#ifndef TILE_J
	#define TILE_J 32
	#endif

	#ifndef TILE_K
	#define TILE_K 32
	#endif

	#ifndef N_THREADS
	#define N_THREADS 12
	#endif

	#define SIMD_HEIGHT 2
	#define SIMD_WIDTH 16
	#define N_BYTES (SIZESIZEsizeof(float))
	#define MIN(a, b) ((a > b) ? (b) : (a))

	typedef unsigned int uint_t;

	void
	mul_slow(float A, float B, float *C) {
	for (uint_t i = 0; i < SIZE; i++) {
	for (uint_t j = 0; j < SIZE; j++) {
	float v = 0;
	for (uint_t k = 0; k < SIZE; k++) {
	v += A[SIZE * i + k] * B[SIZE * k + j];
	}
	C[SIZE * i + j] = v;
	}
	}
	}

	void
	print_mat(float *M, int size) {
	for (int i = 0; i < size; i++) {
	for (int j = 0; j < size; j++) {
	printf("%3.0f ", M[i * size + j]);
	}
	printf("\n");
	}
	printf("\n");
	}

	void
	mul_slow_tile(
	int i0, int i1,
	int j0, int j1,
	int k0, int k1,
	float * restrict A,
	float * restrict B,
	float * restrict C
	) {
	for (int i = i0; i < i1; i ++) {
	for (int k = k0; k < k1; k++) {
	float a = A[SIZE * i + k];
	for (int j = j0; j < j1; j++) {
	float b = B[SIZE * k + j];
	C[SIZE * i + j] += a * b;
	}
	}
	}
	}

	static void
	mul_fast_tile_16x2(
	uint_t i0, uint_t i1,
	uint_t j0, uint_t j1,
	uint_t k0, uint_t k1,
	float * restrict Apt,
	float * restrict Bpt,
	float * restrict C
	) {
	for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) {
	float * restrict Bptr = Bpt;
	float * restrict Cptr0 = &C[SIZE * (i + 0) + j0];
	float * restrict Cptr1 = &C[SIZE * (i + 1) + j0];
	for (uint_t j = j0; j < j1; j += SIMD_WIDTH) {
	__m128 acc00 = _mm_load_ps(Cptr0 + 0);
	__m128 acc01 = _mm_load_ps(Cptr0 + 4);
	__m128 acc02 = _mm_load_ps(Cptr0 + 8);
	__m128 acc03 = _mm_load_ps(Cptr0 + 12);

	__m128 acc10 = _mm_load_ps(Cptr1 + 0);
	__m128 acc11 = _mm_load_ps(Cptr1 + 4);
	__m128 acc12 = _mm_load_ps(Cptr1 + 8);
	__m128 acc13 = _mm_load_ps(Cptr1 + 12);

	float * restrict Aptr = &Apt[SIZE * i + 2 * k0];
	for (uint_t k = k0; k < k1; k++) {
	__m128 a0 = _mm_set1_ps(*Aptr++);
	__m128 a1 = _mm_set1_ps(*Aptr++);
	__m128 b0 = _mm_load_ps(Bptr + 0);
	//Bptr += 4;
	__m128 b1 = _mm_load_ps(Bptr + 4);
	//Bptr += 4;
	__m128 b2 = _mm_load_ps(Bptr + 8);
	//Bptr += 4;
	__m128 b3 = _mm_load_ps(Bptr + 12);
	Bptr += 16;
	acc00 = _mm_add_ps(acc00, _mm_mul_ps(a0, b0));
	acc01 = _mm_add_ps(acc01, _mm_mul_ps(a0, b1));
	acc02 = _mm_add_ps(acc02, _mm_mul_ps(a0, b2));
	acc03 = _mm_add_ps(acc03, _mm_mul_ps(a0, b3));

	acc10 = _mm_add_ps(acc10, _mm_mul_ps(a1, b0));
	acc11 = _mm_add_ps(acc11, _mm_mul_ps(a1, b1));
	acc12 = _mm_add_ps(acc12, _mm_mul_ps(a1, b2));
	acc13 = _mm_add_ps(acc13, _mm_mul_ps(a1, b3));
	}
	_mm_store_ps(Cptr0 + 0, acc00);
	_mm_store_ps(Cptr0 + 4, acc01);
	_mm_store_ps(Cptr0 + 8, acc02);
	_mm_store_ps(Cptr0 + 12, acc03);

	_mm_store_ps(Cptr1 + 0, acc10);
	_mm_store_ps(Cptr1 + 4, acc11);
	_mm_store_ps(Cptr1 + 8, acc12);
	_mm_store_ps(Cptr1 + 12, acc13);

	Cptr0 += 16;
	Cptr1 += 16;
	}
	}
	}

	/* static void */
	/* mul_fast_tile_nd( */
	/* uint_t i0, uint_t i1, */
	/* uint_t j0, uint_t j1, */
	/* uint_t k0, uint_t k1, */
	/* float * restrict A, */
	/* float * restrict Bpt, */
	/* float * restrict C */
	/* ) { */
	/* for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) { */
	/* float Bptr = Bpt; /
	/* for (uint_t j = j0; j < j1; j += SIMD_WIDTH) { */
	/* __m128 acc[SIMD_HEIGHT][SIMD_WIDTH / 4]; */
	/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
	/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
	/* acc[y][x] = _mm_load_ps(&C[SIZE * (i + y) + j + 4x]); /
	/* } */
	/* } */
	/* for (uint_t k = k0; k < k1; k++) { */
	/* __m128 a[SIMD_HEIGHT]; */
	/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
	/* a[y] = _mm_set1_ps(A[SIZE * (i + y) + k]); */
	/* } */
	/* __m128 b[SIMD_WIDTH / 4]; */
	/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
	/* b[x] = _mm_load_ps(Bptr); */
	/* Bptr += 4; */
	/* } */
	/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
	/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
	/* acc[y][x] = _mm_add_ps(acc[y][x], */
	/* _mm_mul_ps(a[y], b[x])); */
	/* } */
	/* } */
	/* } */
	/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */
	/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */
	/* _mm_store_ps(&C[SIZE * (i + y) + j + 4x], /
	/* acc[y][x]); */
	/* } */
	/* } */
	/* } */
	/* } */
	/* } */

	typedef struct {
	float A, Bp, *C;
	uint_t start_i, end_i;
	} mul_job_t;

	static void *
	mul_thread(void *arg) {
	mul_job_t job = (mul_job_t )arg;
	float *A = job->A;
	float *Bp = job->Bp;
	float *C = job->C;
	uint_t start_i = job->start_i;
	uint_t end_i = job->end_i;
	for (uint_t i = start_i; i < end_i; i += TILE_I) {
	uint_t imax = MIN(i + TILE_I, SIZE);
	for (uint_t j = 0; j < SIZE; j += TILE_J) {
	uint_t jmax = MIN(j + TILE_J, SIZE);
	for (uint_t k = 0; k < SIZE; k += TILE_K) {
	uint_t kmax = MIN(k + TILE_K, SIZE);
	float Bptr = &Bp[k SIZE + j * TILE_K];
	mul_fast_tile_16x2(i, imax, j, jmax, k, kmax,
	A, Bptr, C);
	}
	}
	}
	return 0;
	}

	static float *Abuf = NULL;
	static float *Bbuf = NULL;

	static void
	mul_fast(float * restrict A,
	float * restrict B,
	float * restrict C) {

	assert(SIZE % TILE_I == 0);
	assert(SIZE % TILE_J == 0);
	assert(SIZE % TILE_K == 0);
	assert(TILE_I % SIMD_HEIGHT == 0);
	assert(TILE_J % SIMD_WIDTH == 0);
	assert(Bbuf);

	float *Bptr = Bbuf;
	for (uint_t k = 0; k < SIZE; k += TILE_K) {
	for (uint_t j = 0; j < SIZE; j += TILE_J) {
	for (uint_t y = 0; y < TILE_J; y += SIMD_WIDTH) {
	for (uint_t x = 0; x < TILE_K; x++) {
	uint_t src = SIZE * (k + x) + (j + y);
	for (uint_t o = 0; o < SIMD_WIDTH; o++) {
	*Bptr++ = B[src + o];
	}
	}
	}
	}
	}

	float *Aptr = Abuf;
	for (int i = 0; i < SIZE; i += SIMD_HEIGHT) {
	for (int k = 0; k < SIZE; k++) {
	Aptr++ = A[SIZE (i + 0) + k];
	Aptr++ = A[SIZE (i + 1) + k];
	}
	}

	pthread_t threads[N_THREADS];
	mul_job_t jobs[N_THREADS];
	int n_i_tiles = (int)ceil((float)SIZE / (float)TILE_I);
	int tiles_per_thread = (int)ceil((float)n_i_tiles / (float)N_THREADS);
	for (int i = 0; i < N_THREADS; i++) {
	int start = TILE_I * i * tiles_per_thread;
	int end = MIN(TILE_I * (i + 1) * tiles_per_thread, SIZE);
	jobs[i] = (mul_job_t){Abuf, Bbuf, C, start, end};
	pthread_create(&threads[i], NULL, mul_thread, &jobs[i]);
	}
	for (int i = 0; i < N_THREADS; i++) {
	pthread_join(threads[i], NULL);
	}
	}

	int
	main(int argc, char *argv[]) {


	float *A = malloc(N_BYTES);
	float *B = malloc(N_BYTES);
	float *C = calloc(N_BYTES, 1);
	float *C2 = calloc(N_BYTES, 1);

	Abuf = malloc(N_BYTES);
	Bbuf = malloc(N_BYTES);

	for (int i = 0; i < SIZE; i++) {
	for (int j = 0; j < SIZE; j++) {
	A[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
	B[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5;
	}
	}
	//mul_slow((float )A, (float )B, (float *)C2);
	struct timespec begin, end;
	clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
	mul_fast((float )A, (float )B, (float *)C);
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	double delta = (end.tv_nsec - begin.tv_nsec) / 1000000000.0 +
	(end.tv_sec - begin.tv_sec);
	float gflops = (long)SIZE * (long)SIZE * (long)SIZE
	/ (delta * 1000.0 * 1000.0 * 1000.0);
	//printf("%.6lfs, %.2f gflops\n", delta, gflops);

	printf("%5d %8d %6d %6d %6d %6.2f %7.2f\n",
	SIZE, N_THREADS, TILE_I, TILE_J, TILE_K, delta, gflops);

	/* for (int i = 0; i < SIZE; i++) { */
	/* for (int j = 0; j < SIZE; j++) { */
	/* float v = C[SIZE * i + j]; */
	/* float v2 = C2[SIZE * i + j]; */
	/* float diff = fabs(v - v2); */
	/* if (diff > 0.1) { */
	/* printf("%6.2f %6.2f\n", v, v2); */
	/* assert(false); */
	/* } */
	/* } */
	/* } */
	free(A);
	free(B);
	free(C);
	free(C2);
	free(Abuf);
	free(Bbuf);

	return 0;
	}