Created
January 24, 2013 13:26
-
-
Save 5kg/4621553 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
操作系统编程中,程序的性能至关重要。现代计算机体系结构中,CPU缓存对性能起着至关重要的影响。本系列文章通过实验,说明一些合理利用CPU缓存的建议。 | |
== 顺序存取内存 | |
本文通过简单的并行数组求和的程序,来对缓存性能进行测试。这里实现了两个函数 `foo()` 和 `bar()` ,其中 `foo()` 交错的读取数组元素进行求和,而 `bar()` 则将数组分块后,顺序读取求和。程序如下: | |
[source, c] | |
---- | |
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; \ | |
clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + \ | |
(double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 536870912 | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
for (int i = start; i < end; ++i) | |
*r += num[i]; | |
return NULL; | |
} | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
---- | |
测试结果如下: | |
---- | |
$ gcc ping-pong.c -std=gnu99 -lpthread -O2 ; ./a.out | |
1.171061s | |
0.406542s | |
---- | |
我们可以看到 `foo()` 近三倍的慢于 `bar()` 。而两种方法的计算量是一致的,只有数据的存取顺序不同,是什么导致了如此大的性能差异?可以猜想是缓存命中率的区别。为了验证,我们可以用 `valgrind` 来profile缓存的命中情况,结果如下: | |
---- | |
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out | |
.... | |
$ cg_annotate profile --auto=yes --show=D1mr --context=1 | |
.... | |
-- line 63 ---------------------------------------- | |
. void* foo(void* ptr) | |
0 { | |
0 int* r = (int*) ptr; | |
. | |
0 for (int i = (r - s); i < N; i += NUM_THREADS) | |
16,388 *r += num[i]; | |
0 return NULL; | |
0 } | |
. | |
-- line 71 ---------------------------------------- | |
-- line 72 ---------------------------------------- | |
. void* bar(void* ptr) | |
0 { | |
0 int* r = (int*) ptr; | |
0 int idx = r - s; | |
0 int block = N/NUM_THREADS; | |
0 int start = idx * block, end = start + block; | |
. | |
0 for (int i = start; i < end; ++i) | |
4,098 *r += num[i]; | |
0 return NULL; | |
0 } | |
---- | |
结果一目了然,由于 `foo()` 没有顺序的读取内存,其每个线程读取的页数是 `bar` 的四倍,于是导致了四倍的缺页错误。 | |
为此,在读写内存时,应尽可能的顺序读取,保证数据的局部性,以减少缺页而引起的性能损失。 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 536870912 | |
void* foo(void *); | |
void* bar(void *); | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
for (int i = start; i < end; ++i) | |
*r += num[i]; | |
return NULL; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment