Created
January 24, 2013 17:47
-
-
Save 5kg/4625673 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
/* Benchmark dimensions. */
#define N 65536 /* number of elements in each array */
#define T 10000 /* number of random element visits per timed loop */
#define SIZE 63 /* payload bytes per element, one less than foo's 64-byte alignment */

/*
 * foo: the attribute follows the closing brace, so it applies to the element
 * type. Each element is padded from SIZE (63) to 64 bytes and starts on a
 * 64-byte boundary.
 */
struct {
    char s[SIZE];
} __attribute__((aligned(64))) foo[N];

/*
 * bar: identical payload with no alignment request. Elements are packed back
 * to back, SIZE bytes apart, so most of them do not start on a 64-byte
 * boundary.
 */
struct {
    char s[SIZE];
} bar[N];
/*
 * tic()/toc() time the statements written between them.
 *
 * tic() opens a "do {" scope, declares ts_start/ts_end and samples
 * CLOCK_MONOTONIC. toc() samples the clock again, prints the elapsed time in
 * seconds followed by the letter "s", and closes the scope with
 * "} while (0)". The two macros must therefore be used as a matched pair in
 * the same block, each followed by a semicolon. Anything declared between
 * them goes out of scope at toc().
 */
#define tic() \
    do { \
        struct timespec ts_start, ts_end; \
        clock_gettime(CLOCK_MONOTONIC, &ts_start)

#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &ts_end); \
        printf("%lfs\n", \
               (double)(ts_end.tv_sec - ts_start.tv_sec) + \
               (double)(ts_end.tv_nsec - ts_start.tv_nsec) / 1e9); \
    } while (0)
int main() | |
{ | |
for (int i = 0; i < N; ++i) | |
for (int j = 0; j < SIZE; ++j) { | |
foo[i].s[j] = rand() % 100; | |
bar[i].s[j] = rand() % 100; | |
} | |
int s; | |
s = 0; | |
tic(); | |
for (int i = 0; i < T; ++i) { | |
int idx = rand() % N; | |
for (int j = 0; j < SIZE; ++j) | |
s += foo[idx].s[j]; | |
} | |
toc(); | |
printf("%d\n", s); | |
s = 0; | |
tic(); | |
for (int i = 0; i < T; ++i) { | |
int idx = rand() % N; | |
for (int j = 0; j < SIZE; ++j) | |
s += bar[idx].s[j]; | |
} | |
toc(); | |
printf("%d\n", s); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
操作系统编程中,程序的性能至关重要。在现代计算机体系结构中,CPU缓存对性能有着至关重要的影响。本系列文章通过实验,给出一些合理利用CPU缓存的建议。
== 对齐缓存线 | |
内核代码中,类似 `__attribute__((aligned(256)))` 的代码,到处可见。下面通过实验验证其对性能的影响。代码中,构造了两个结构数组,结构体大小为63个字节,其中一个对齐到64字节,而另一个没有;构造完成后,随机选取结构数组的元素进行操作。代码如下: | |
[source, c] | |
---- | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
/* Benchmark dimensions. */
#define N 65536     /* number of elements in each array */
#define T 100000000 /* number of random element visits per timed loop */
#define SIZE 63     /* payload bytes per element, one less than foo's 64-byte alignment */

/*
 * foo: the attribute follows the closing brace, so it applies to the element
 * type. Each element is padded from SIZE (63) to 64 bytes and starts on a
 * 64-byte boundary.
 */
struct {
    char s[SIZE];
} __attribute__((aligned(64))) foo[N];

/*
 * bar: identical payload with no alignment request. Elements are packed back
 * to back, SIZE bytes apart.
 */
struct {
    char s[SIZE];
} bar[N];
/*
 * tic()/toc() time the statements written between them. tic() opens a
 * "do {" scope and samples CLOCK_MONOTONIC; toc() samples it again, prints
 * the elapsed seconds followed by "s", and closes the scope. They must be
 * used as a matched pair in the same block.
 */
#define tic() \
    do { \
        struct timespec ts_start, ts_end; \
        clock_gettime(CLOCK_MONOTONIC, &ts_start)

#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &ts_end); \
        printf("%lfs\n", \
               (double)(ts_end.tv_sec - ts_start.tv_sec) + \
               (double)(ts_end.tv_nsec - ts_start.tv_nsec) / 1e9); \
    } while (0)
int main() | |
{ | |
for (int i = 0; i < N; ++i) | |
for (int j = 0; j < SIZE; ++j) { | |
foo[i].s[j] = rand() % 100; | |
bar[i].s[j] = rand() % 100; | |
} | |
int s; | |
s = 0; | |
tic(); | |
for (int i = 0; i < T; ++i) { | |
int idx = rand() % N; | |
for (int j = 0; j < SIZE; ++j) | |
s += foo[idx].s[j]; | |
} | |
toc(); | |
s = 0; | |
tic(); | |
for (int i = 0; i < T; ++i) { | |
int idx = rand() % N; | |
for (int j = 0; j < SIZE; ++j) | |
s += bar[idx].s[j]; | |
} | |
toc(); | |
} | |
---- | |
运行结果如下: | |
---- | |
$ gcc align.c -O2 -std=gnu99 ; ./a.out | |
5.732326s | |
7.576273s | |
---- | |
操作对齐过的数据明显更快(在上面的结果中,耗时减少了约 24%)。
下面来看看缓存的情况: | |
---- | |
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out | |
.... | |
$ cg_annotate profile --auto=yes --show=D1mr,DLmr --context=1 | |
.... | |
0 0 for (int i = 0; i < T; ++i) { | |
0 0 int idx = rand() % N; | |
0 0 for (int j = 0; j < SIZE; ++j) | |
9,930 2,330 s += foo[idx].s[j]; | |
. . } | |
0 0 for (int i = 0; i < T; ++i) { | |
0 0 int idx = rand() % N; | |
0 0 for (int j = 0; j < SIZE; ++j) | |
19,547 4,865 s += bar[idx].s[j]; | |
. . } | |
---- | |
果然不出所料,操作未对齐的数组,会出现更多的缓存未命中(约为对齐数组的两倍)。由于数据没有对齐,单个结构体往往会跨越两条缓存线,对它进行操作就可能导致两次缓存未命中,这便是导致性能损失的原因。
由上面的实验,在一些情况下,我们可以牺牲一些空间,对齐数据,而换取速度的提升。 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment