Created
January 24, 2013 15:35
-
-
Save 5kg/4623213 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
操作系统编程中,程序的性能至关重要。现代计算机体系结构中,CPU缓存对性能起着至关重要的影响。本系列文章通过实验,说明一些合理利用CPU缓存的建议。 | |
== 避免缓存同步 | |
上一篇中提到的并行数组求和的程序,还能不能进一步优化呢?这里我们做一个简单的修改,每次求和不把结果写回 `s` ,而是存入临时变量 `sum` ,最后再复制到 `s` 里。请看如下代码: | |
[source, c] | |
---- | |
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; \ | |
clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + \ | |
(double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 536870912 | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int sum = 0; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
sum += num[i]; | |
*r = sum; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
int sum = 0; | |
for (int i = start; i < end; ++i) | |
sum += num[i]; | |
*r = sum; | |
return NULL; | |
} | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
---- | |
测试结果如下: | |
---- | |
$ gcc ping-pong.c -std=gnu99 -lpthread -O2 ; ./a.out | |
1.171061s | |
0.406542s | |
$ gcc ping-pong2.c -std=gnu99 -lpthread -O2 ; ./a.out | |
0.669291s | |
0.271119s | |
---- | |
可以看到,修改后的程序速度大大提高。 | |
这里的问题在于缓存一致性。多核系统中,不同核心同时修改一块内存区域,则CPU必须保证内容的同步性,如下图所示: | |
image:cache.png[] | |
由缓存的性质所决定,每次CPU会把访问地址相邻的整片内存读入缓存。于是即使不同线程不同时读写一个内存单元,CPU也必须隐式的维护整条缓存线的一致性。这样便引入了新的同步开销,降低性能。 | |
为此,可以引入局部变量保存中间结果,以避免维护缓存一致性带来的开销。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 536870912 | |
void* foo(void *); | |
void* bar(void *); | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int sum = 0; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
sum += num[i]; | |
*r = sum; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
int sum = 0; | |
for (int i = start; i < end; ++i) | |
sum += num[i]; | |
*r = sum; | |
return NULL; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment