Last active
April 16, 2017 05:34
-
-
Save yangzhixuan/8277291 to your computer and use it in GitHub Desktop.
分块矩阵乘法效率的实验
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* 执行同等大小的矩阵乘法10次的总时间(gcc version 4.8.1, 64位linux),第一项是取块大小为50的分块乘法,第二项是朴素乘法 | |
* | |
* 无任何优化选项 | |
* 500*500 8.05s 8.52s | |
* 1000*1000 64.96s 111.66s | |
* | |
* 开启O2 | |
* 500*500 1.78s 1.89s | |
* 1000*1000 14.46s 88.94s | |
* | |
* 开启-Ofast -march=native的优化选项 | |
* 500*500 1.58s 0.46s | |
* 1000*1000 12.67s 23.15s | |
* | |
* 开启-Ofast -march=native -funroll-loops的优化选项 | |
* 500*500 0.72s 0.35s | |
* 1000*1000 5.9s 23.55s | |
* | |
* 对于较大的矩阵乘法,分块写法的优化简单有效(开启足够多的优化选项,最多能提速6倍)。对于较小的乘法,分块优势不明显。 | |
* */ | |
#include <string.h> | |
#include <assert.h> | |
#include <stdlib.h> | |
#define BLK_SIZE 50 | |
// C = A * B, where A is a n * m, b is m * s matrix | |
void mul(int* C, int* A, int* B, int n, int m, int s) | |
{ | |
memset(C, 0, sizeof(int) * n * s); | |
int i, j, k, i1, j1, k1; | |
for(i = 0; i < n; i += BLK_SIZE) | |
for(j = 0; j < s; j += BLK_SIZE) | |
for(k = 0; k < m; k += BLK_SIZE) | |
for(i1 = i; i1 < i + BLK_SIZE && i1 < n; i1++) | |
for(j1 = j; j1 < j + BLK_SIZE && j1 < s; j1++) | |
for(k1 = k; k1 < k + BLK_SIZE && k1 < m; k1++) | |
/* C[i1][j1] += A[i1][k1] * B[k1][j1] */ | |
C[j1 + i1 * s] += | |
A[k1 + i1 * m] * B[j1 + k1 * s]; | |
} | |
void mul_stupid(int* C, int* A, int* B, int n, int m, int s) | |
{ | |
memset(C, 0, sizeof(int) * n * s); | |
int i, j, k; | |
for(i = 0; i < n; i++) | |
for(j = 0; j < s; j++) | |
for(k = 0; k < m; k++) | |
C[j + i * s] += A[k + i * m] * B[j + k * s]; | |
} | |
int A[500][500], B[500][500], C[500][500]; | |
int main() | |
{ | |
int i, j, k; | |
for(i=0; i<500; i++) | |
for(j=0; j < 500; j++) | |
B[i][j] = rand() % 500; | |
for(i=0; i<500; i++) | |
for(j=0; j <500; j++) | |
A[i][j] = rand() % 500; | |
for(k = 0; k < 10; k++) | |
mul((int*)C, (int*)A, (int*)B, 500,500,500); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment