Created
September 11, 2022 12:10
-
-
Save bjourne/860b8b509c85bedd9bf2eea418668b4d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Notes: | |
// | |
// * unsigned int vs. int: makes a small difference for clang but | |
// probably not for gcc. | |
// * best tiling appears to be 256x256x256. | |
// | |
// 12.31 for two 8192 matrices | |
#include <assert.h> | |
#include <math.h> | |
#include <pthread.h> | |
#include <stdbool.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <xmmintrin.h> | |
#ifndef SIZE | |
#define SIZE 256 | |
#endif | |
#ifndef TILE_I | |
#define TILE_I 32 | |
#endif | |
#ifndef TILE_J | |
#define TILE_J 32 | |
#endif | |
#ifndef TILE_K | |
#define TILE_K 32 | |
#endif | |
#ifndef N_THREADS | |
#define N_THREADS 12 | |
#endif | |
#define SIMD_HEIGHT 2 | |
#define SIMD_WIDTH 16 | |
#define N_BYTES (SIZE*SIZE*sizeof(float)) | |
#define MIN(a, b) ((a > b) ? (b) : (a)) | |
typedef unsigned int uint_t; | |
void | |
mul_slow(float *A, float *B, float *C) { | |
for (uint_t i = 0; i < SIZE; i++) { | |
for (uint_t j = 0; j < SIZE; j++) { | |
float v = 0; | |
for (uint_t k = 0; k < SIZE; k++) { | |
v += A[SIZE * i + k] * B[SIZE * k + j]; | |
} | |
C[SIZE * i + j] = v; | |
} | |
} | |
} | |
void | |
print_mat(float *M, int size) { | |
for (int i = 0; i < size; i++) { | |
for (int j = 0; j < size; j++) { | |
printf("%3.0f ", M[i * size + j]); | |
} | |
printf("\n"); | |
} | |
printf("\n"); | |
} | |
void | |
mul_slow_tile( | |
int i0, int i1, | |
int j0, int j1, | |
int k0, int k1, | |
float * restrict A, | |
float * restrict B, | |
float * restrict C | |
) { | |
for (int i = i0; i < i1; i ++) { | |
for (int k = k0; k < k1; k++) { | |
float a = A[SIZE * i + k]; | |
for (int j = j0; j < j1; j++) { | |
float b = B[SIZE * k + j]; | |
C[SIZE * i + j] += a * b; | |
} | |
} | |
} | |
} | |
static void | |
mul_fast_tile_16x2( | |
uint_t i0, uint_t i1, | |
uint_t j0, uint_t j1, | |
uint_t k0, uint_t k1, | |
float * restrict Apt, | |
float * restrict Bpt, | |
float * restrict C | |
) { | |
for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) { | |
float * restrict Bptr = Bpt; | |
float * restrict Cptr0 = &C[SIZE * (i + 0) + j0]; | |
float * restrict Cptr1 = &C[SIZE * (i + 1) + j0]; | |
for (uint_t j = j0; j < j1; j += SIMD_WIDTH) { | |
__m128 acc00 = _mm_load_ps(Cptr0 + 0); | |
__m128 acc01 = _mm_load_ps(Cptr0 + 4); | |
__m128 acc02 = _mm_load_ps(Cptr0 + 8); | |
__m128 acc03 = _mm_load_ps(Cptr0 + 12); | |
__m128 acc10 = _mm_load_ps(Cptr1 + 0); | |
__m128 acc11 = _mm_load_ps(Cptr1 + 4); | |
__m128 acc12 = _mm_load_ps(Cptr1 + 8); | |
__m128 acc13 = _mm_load_ps(Cptr1 + 12); | |
float * restrict Aptr = &Apt[SIZE * i + 2 * k0]; | |
for (uint_t k = k0; k < k1; k++) { | |
__m128 a0 = _mm_set1_ps(*Aptr++); | |
__m128 a1 = _mm_set1_ps(*Aptr++); | |
__m128 b0 = _mm_load_ps(Bptr + 0); | |
//Bptr += 4; | |
__m128 b1 = _mm_load_ps(Bptr + 4); | |
//Bptr += 4; | |
__m128 b2 = _mm_load_ps(Bptr + 8); | |
//Bptr += 4; | |
__m128 b3 = _mm_load_ps(Bptr + 12); | |
Bptr += 16; | |
acc00 = _mm_add_ps(acc00, _mm_mul_ps(a0, b0)); | |
acc01 = _mm_add_ps(acc01, _mm_mul_ps(a0, b1)); | |
acc02 = _mm_add_ps(acc02, _mm_mul_ps(a0, b2)); | |
acc03 = _mm_add_ps(acc03, _mm_mul_ps(a0, b3)); | |
acc10 = _mm_add_ps(acc10, _mm_mul_ps(a1, b0)); | |
acc11 = _mm_add_ps(acc11, _mm_mul_ps(a1, b1)); | |
acc12 = _mm_add_ps(acc12, _mm_mul_ps(a1, b2)); | |
acc13 = _mm_add_ps(acc13, _mm_mul_ps(a1, b3)); | |
} | |
_mm_store_ps(Cptr0 + 0, acc00); | |
_mm_store_ps(Cptr0 + 4, acc01); | |
_mm_store_ps(Cptr0 + 8, acc02); | |
_mm_store_ps(Cptr0 + 12, acc03); | |
_mm_store_ps(Cptr1 + 0, acc10); | |
_mm_store_ps(Cptr1 + 4, acc11); | |
_mm_store_ps(Cptr1 + 8, acc12); | |
_mm_store_ps(Cptr1 + 12, acc13); | |
Cptr0 += 16; | |
Cptr1 += 16; | |
} | |
} | |
} | |
/* static void */ | |
/* mul_fast_tile_nd( */ | |
/* uint_t i0, uint_t i1, */ | |
/* uint_t j0, uint_t j1, */ | |
/* uint_t k0, uint_t k1, */ | |
/* float * restrict A, */ | |
/* float * restrict Bpt, */ | |
/* float * restrict C */ | |
/* ) { */ | |
/* for (uint_t i = i0; i < i1; i += SIMD_HEIGHT) { */ | |
/* float *Bptr = Bpt; */ | |
/* for (uint_t j = j0; j < j1; j += SIMD_WIDTH) { */ | |
/* __m128 acc[SIMD_HEIGHT][SIMD_WIDTH / 4]; */ | |
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */ | |
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */ | |
/* acc[y][x] = _mm_load_ps(&C[SIZE * (i + y) + j + 4*x]); */ | |
/* } */ | |
/* } */ | |
/* for (uint_t k = k0; k < k1; k++) { */ | |
/* __m128 a[SIMD_HEIGHT]; */ | |
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */ | |
/* a[y] = _mm_set1_ps(A[SIZE * (i + y) + k]); */ | |
/* } */ | |
/* __m128 b[SIMD_WIDTH / 4]; */ | |
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */ | |
/* b[x] = _mm_load_ps(Bptr); */ | |
/* Bptr += 4; */ | |
/* } */ | |
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */ | |
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */ | |
/* acc[y][x] = _mm_add_ps(acc[y][x], */ | |
/* _mm_mul_ps(a[y], b[x])); */ | |
/* } */ | |
/* } */ | |
/* } */ | |
/* for (uint_t y = 0; y < SIMD_HEIGHT; y++) { */ | |
/* for (uint_t x = 0; x < SIMD_WIDTH/4; x++) { */ | |
/* _mm_store_ps(&C[SIZE * (i + y) + j + 4*x], */ | |
/* acc[y][x]); */ | |
/* } */ | |
/* } */ | |
/* } */ | |
/* } */ | |
/* } */ | |
typedef struct { | |
float *A, *Bp, *C; | |
uint_t start_i, end_i; | |
} mul_job_t; | |
static void * | |
mul_thread(void *arg) { | |
mul_job_t *job = (mul_job_t *)arg; | |
float *A = job->A; | |
float *Bp = job->Bp; | |
float *C = job->C; | |
uint_t start_i = job->start_i; | |
uint_t end_i = job->end_i; | |
for (uint_t i = start_i; i < end_i; i += TILE_I) { | |
uint_t imax = MIN(i + TILE_I, SIZE); | |
for (uint_t j = 0; j < SIZE; j += TILE_J) { | |
uint_t jmax = MIN(j + TILE_J, SIZE); | |
for (uint_t k = 0; k < SIZE; k += TILE_K) { | |
uint_t kmax = MIN(k + TILE_K, SIZE); | |
float *Bptr = &Bp[k * SIZE + j * TILE_K]; | |
mul_fast_tile_16x2(i, imax, j, jmax, k, kmax, | |
A, Bptr, C); | |
} | |
} | |
} | |
return 0; | |
} | |
static float *Abuf = NULL; | |
static float *Bbuf = NULL; | |
static void | |
mul_fast(float * restrict A, | |
float * restrict B, | |
float * restrict C) { | |
assert(SIZE % TILE_I == 0); | |
assert(SIZE % TILE_J == 0); | |
assert(SIZE % TILE_K == 0); | |
assert(TILE_I % SIMD_HEIGHT == 0); | |
assert(TILE_J % SIMD_WIDTH == 0); | |
assert(Bbuf); | |
float *Bptr = Bbuf; | |
for (uint_t k = 0; k < SIZE; k += TILE_K) { | |
for (uint_t j = 0; j < SIZE; j += TILE_J) { | |
for (uint_t y = 0; y < TILE_J; y += SIMD_WIDTH) { | |
for (uint_t x = 0; x < TILE_K; x++) { | |
uint_t src = SIZE * (k + x) + (j + y); | |
for (uint_t o = 0; o < SIMD_WIDTH; o++) { | |
*Bptr++ = B[src + o]; | |
} | |
} | |
} | |
} | |
} | |
float *Aptr = Abuf; | |
for (int i = 0; i < SIZE; i += SIMD_HEIGHT) { | |
for (int k = 0; k < SIZE; k++) { | |
*Aptr++ = A[SIZE * (i + 0) + k]; | |
*Aptr++ = A[SIZE * (i + 1) + k]; | |
} | |
} | |
pthread_t threads[N_THREADS]; | |
mul_job_t jobs[N_THREADS]; | |
int n_i_tiles = (int)ceil((float)SIZE / (float)TILE_I); | |
int tiles_per_thread = (int)ceil((float)n_i_tiles / (float)N_THREADS); | |
for (int i = 0; i < N_THREADS; i++) { | |
int start = TILE_I * i * tiles_per_thread; | |
int end = MIN(TILE_I * (i + 1) * tiles_per_thread, SIZE); | |
jobs[i] = (mul_job_t){Abuf, Bbuf, C, start, end}; | |
pthread_create(&threads[i], NULL, mul_thread, &jobs[i]); | |
} | |
for (int i = 0; i < N_THREADS; i++) { | |
pthread_join(threads[i], NULL); | |
} | |
} | |
int | |
main(int argc, char *argv[]) { | |
float *A = malloc(N_BYTES); | |
float *B = malloc(N_BYTES); | |
float *C = calloc(N_BYTES, 1); | |
float *C2 = calloc(N_BYTES, 1); | |
Abuf = malloc(N_BYTES); | |
Bbuf = malloc(N_BYTES); | |
for (int i = 0; i < SIZE; i++) { | |
for (int j = 0; j < SIZE; j++) { | |
A[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5; | |
B[SIZE * i + j] = (rand() / (float)RAND_MAX) * 5; | |
} | |
} | |
//mul_slow((float *)A, (float *)B, (float *)C2); | |
struct timespec begin, end; | |
clock_gettime(CLOCK_MONOTONIC_RAW, &begin); | |
mul_fast((float *)A, (float *)B, (float *)C); | |
clock_gettime(CLOCK_MONOTONIC_RAW, &end); | |
double delta = (end.tv_nsec - begin.tv_nsec) / 1000000000.0 + | |
(end.tv_sec - begin.tv_sec); | |
float gflops = (long)SIZE * (long)SIZE * (long)SIZE | |
/ (delta * 1000.0 * 1000.0 * 1000.0); | |
//printf("%.6lfs, %.2f gflops\n", delta, gflops); | |
printf("%5d %8d %6d %6d %6d %6.2f %7.2f\n", | |
SIZE, N_THREADS, TILE_I, TILE_J, TILE_K, delta, gflops); | |
/* for (int i = 0; i < SIZE; i++) { */ | |
/* for (int j = 0; j < SIZE; j++) { */ | |
/* float v = C[SIZE * i + j]; */ | |
/* float v2 = C2[SIZE * i + j]; */ | |
/* float diff = fabs(v - v2); */ | |
/* if (diff > 0.1) { */ | |
/* printf("%6.2f %6.2f\n", v, v2); */ | |
/* assert(false); */ | |
/* } */ | |
/* } */ | |
/* } */ | |
free(A); | |
free(B); | |
free(C); | |
free(C2); | |
free(Abuf); | |
free(Bbuf); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment