Skip to content

Instantly share code, notes, and snippets.

@5kg
Created January 24, 2013 16:04
Show Gist options
  • Save 5kg/4623909 to your computer and use it in GitHub Desktop.
Save 5kg/4623909 to your computer and use it in GitHub Desktop.
操作系统编程中,程序的性能至关重要。现代计算机体系结构中,CPU缓存对性能起着至关重要的影响。本系列文章通过实验,说明一些合理利用CPU缓存的建议。
== 保持数据的局部性
这一次,依然在第一篇的基础上修改, `foo` 完全不变,而仅仅是把 `bar()` ,改为交错的读取内存。代码如下:
[source, c]
----
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <assert.h>
#define tic() do { struct timespec ts_start, ts_end; \
clock_gettime(CLOCK_MONOTONIC, &ts_start)
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + \
(double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \
while (0)
#define test(func) memset(s, 0, sizeof(s)); \
ans = 0; \
tic(); \
for (int i = 0; i < NUM_THREADS; ++i) \
pthread_create(threads + i, NULL, func, s + i); \
for (int i = 0; i < NUM_THREADS; ++i) { \
pthread_join(threads[i], NULL); \
ans += s[i]; } \
toc(); \
assert(ans == ANS);
#define NUM_THREADS 4
#define N 536870912
void* foo(void* ptr)
{
int* r = (int*) ptr;
for (int i = (r - s); i < N; i += NUM_THREADS)
*r += num[i];
return NULL;
}
void* bar(void* ptr)
{
int* r = (int*) ptr;
int idx = r - s;
int block = N/NUM_THREADS;
int start = idx * block, end = start + block;
for (int j = 0; j < NUM_THREADS; ++j)
for (int i = start+j; i < end; i+=NUM_THREADS)
*r += num[i];
return NULL;
}
int* num;
int s[NUM_THREADS];
int main()
{
pthread_t threads[NUM_THREADS];
int ans, ANS = 0;
num = (int*) malloc(sizeof(int) * N);
srand(42);
for (int i = 0; i < N; ++i) {
num[i] = rand() % 100;
ANS += num[i];
}
test(foo);
test(bar);
return 0;
}
----
我们应当预期这时的 `foo()` `bar()` 速度大致相同,因为他们均为交错读取,缓存未命中率应该大致相同。
而测试结果如下:
----
$ gcc ping-pong.c -std=gnu99 -lpthread -O2 ; ./a.out
1.171061s
0.406542s
$ gcc ping-pong1.c -std=gnu99 -lpthread -O2 ; ./a.out
1.107319s
1.496597s
----
奇怪的是 `bar()` 这次大大慢过了 `foo()` 。而两者的缓存未命中数是一样的:
----
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out
....
$ cg_annotate profile --auto=yes --show=D1mr --context=1
....
-- line 51 ----------------------------------------
. void* foo(void* ptr)
0 {
0 int* r = (int*) ptr;
.
0 for (int i = (r - s); i < N; i += NUM_THREADS)
16,388 *r += num[i];
0 return NULL;
0 }
.
-- line 59 ----------------------------------------
-- line 60 ----------------------------------------
. void* bar(void* ptr)
0 {
0 int* r = (int*) ptr;
0 int idx = r - s;
0 int block = N/NUM_THREADS;
0 int start = idx * block, end = start + block;
.
0 for (int j = 0; j < NUM_THREADS; ++j)
0 for (int i = start+j; i < end; i+=NUM_THREADS)
16,399 *r += num[i];
0 return NULL;
0 }
----
这又作何解释呢?我认为区别在于多个核心共享的L3缓存的命中率(由于没有合适的profile工具,(Intel的profiler应该可以看L3的命中率),以下仅为猜想)。
image:cpu.jpg[]
`foo()` 由于交错的遍历整块内存,在每个线程进度大体一致的前提下,各个线程在任意时刻应该访问相近的内存区域。由于现在CPU的第三级缓存是由多个核共享的,这样便能提高L3缓存的命中率。`bar()` 则将数组分块,各个线程会互相把各自的数据踢出L3缓存,这便导致了性能的损失。
这样由于共享缓存的存在,编程中应该把进程间的数据局部性也考虑进来。Linux内核开发者,也做了一些相关的尝试,如展示时所提到引发PostgresSQL Bug的,将线程尽可能在同一CPU的不同核心调度等思路。
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <assert.h>
#define tic() do { struct timespec ts_start, ts_end; clock_gettime(CLOCK_MONOTONIC, &ts_start)
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \
while (0)
#define test(func) memset(s, 0, sizeof(s)); \
ans = 0; \
tic(); \
for (int i = 0; i < NUM_THREADS; ++i) \
pthread_create(threads + i, NULL, func, s + i); \
for (int i = 0; i < NUM_THREADS; ++i) { \
pthread_join(threads[i], NULL); \
ans += s[i]; } \
toc(); \
assert(ans == ANS);
#define NUM_THREADS 4
#define N 65536
void* foo(void *);
void* bar(void *);
int* num;
int s[NUM_THREADS];
int main()
{
pthread_t threads[NUM_THREADS];
int ans, ANS = 0;
num = (int*) malloc(sizeof(int) * N);
srand(42);
for (int i = 0; i < N; ++i) {
num[i] = rand() % 100;
ANS += num[i];
}
test(foo);
test(bar);
return 0;
}
void* foo(void* ptr)
{
int* r = (int*) ptr;
for (int i = (r - s); i < N; i += NUM_THREADS)
*r += num[i];
return NULL;
}
void* bar(void* ptr)
{
int* r = (int*) ptr;
int idx = r - s;
int block = N/NUM_THREADS;
int start = idx * block, end = start + block;
for (int j = 0; j < NUM_THREADS; ++j)
for (int i = start+j; i < end; i+=NUM_THREADS)
*r += num[i];
return NULL;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment