Skip to content

Instantly share code, notes, and snippets.

@5kg
Created January 24, 2013 15:35
Show Gist options
  • Save 5kg/4623213 to your computer and use it in GitHub Desktop.
Save 5kg/4623213 to your computer and use it in GitHub Desktop.
操作系统编程中,程序的性能至关重要。现代计算机体系结构中,CPU缓存对性能起着至关重要的影响。本系列文章通过实验,说明一些合理利用CPU缓存的建议。
== 避免缓存同步
上一篇中提到的并行数组求和的程序,还能不能进一步优化呢?这里我们做一个简单的修改,每次求和不把结果写回 `s` ,而是存入临时变量 `sum` ,最后再复制到 `s` 里。请看如下代码:
[source, c]
----
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <assert.h>
#define tic() do { struct timespec ts_start, ts_end; \
clock_gettime(CLOCK_MONOTONIC, &ts_start)
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + \
(double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \
while (0)
#define test(func) memset(s, 0, sizeof(s)); \
ans = 0; \
tic(); \
for (int i = 0; i < NUM_THREADS; ++i) \
pthread_create(threads + i, NULL, func, s + i); \
for (int i = 0; i < NUM_THREADS; ++i) { \
pthread_join(threads[i], NULL); \
ans += s[i]; } \
toc(); \
assert(ans == ANS);
#define NUM_THREADS 4
#define N 536870912
void* foo(void* ptr)
{
int* r = (int*) ptr;
int sum = 0;
for (int i = (r - s); i < N; i += NUM_THREADS)
sum += num[i];
*r = sum;
return NULL;
}
void* bar(void* ptr)
{
int* r = (int*) ptr;
int idx = r - s;
int block = N/NUM_THREADS;
int start = idx * block, end = start + block;
int sum = 0;
for (int i = start; i < end; ++i)
sum += num[i];
*r = sum;
return NULL;
}
int* num;
int s[NUM_THREADS];
int main()
{
pthread_t threads[NUM_THREADS];
int ans, ANS = 0;
num = (int*) malloc(sizeof(int) * N);
srand(42);
for (int i = 0; i < N; ++i) {
num[i] = rand() % 100;
ANS += num[i];
}
test(foo);
test(bar);
return 0;
}
----
测试结果如下:
----
$ gcc ping-pong.c -std=gnu99 -lpthread -O2 ; ./a.out
1.171061s
0.406542s
$ gcc ping-pong2.c -std=gnu99 -lpthread -O2 ; ./a.out
0.669291s
0.271119s
----
可以看到,修改后的程序速度大大提高。
这里的问题在于缓存一致性。多核系统中,不同核心同时修改一块内存区域,则CPU必须保证内容的同步性,如下图所示:
image:cache.png[]
由缓存的性质所决定,每次CPU会把访问地址相邻的整片内存读入缓存。于是即使不同线程不同时读写一个内存单元,CPU也必须隐式的维护整条缓存线的一致性。这样便引入了新的同步开销,降低性能。
为此,可以引入局部变量保存中间结果,以避免维护缓存一致性带来的开销。
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <assert.h>
#define tic() do { struct timespec ts_start, ts_end; clock_gettime(CLOCK_MONOTONIC, &ts_start)
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \
while (0)
#define test(func) memset(s, 0, sizeof(s)); \
ans = 0; \
tic(); \
for (int i = 0; i < NUM_THREADS; ++i) \
pthread_create(threads + i, NULL, func, s + i); \
for (int i = 0; i < NUM_THREADS; ++i) { \
pthread_join(threads[i], NULL); \
ans += s[i]; } \
toc(); \
assert(ans == ANS);
#define NUM_THREADS 4
#define N 536870912
void* foo(void *);
void* bar(void *);
int* num;
int s[NUM_THREADS];
int main()
{
pthread_t threads[NUM_THREADS];
int ans, ANS = 0;
num = (int*) malloc(sizeof(int) * N);
srand(42);
for (int i = 0; i < N; ++i) {
num[i] = rand() % 100;
ANS += num[i];
}
test(foo);
test(bar);
return 0;
}
void* foo(void* ptr)
{
int* r = (int*) ptr;
int sum = 0;
for (int i = (r - s); i < N; i += NUM_THREADS)
sum += num[i];
*r = sum;
return NULL;
}
void* bar(void* ptr)
{
int* r = (int*) ptr;
int idx = r - s;
int block = N/NUM_THREADS;
int start = idx * block, end = start + block;
int sum = 0;
for (int i = start; i < end; ++i)
sum += num[i];
*r = sum;
return NULL;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment