Skip to content

Instantly share code, notes, and snippets.

@5kg
Created January 24, 2013 17:47
Show Gist options
  • Save 5kg/4625673 to your computer and use it in GitHub Desktop.
Save 5kg/4625673 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Benchmark dimensions. */
#define N 65536 /* elements in each array */
#define T 10000 /* random element visits per timed run */
#define SIZE 63 /* payload bytes per element */

/*
 * foo: the element type is aligned to 64 bytes, so each element is padded
 * from SIZE up to 64 bytes and every element starts on a 64-byte boundary.
 */
struct __attribute__((aligned(64))) {
    char s[SIZE];
} foo[N];

/*
 * bar: identical payload with no alignment request, so consecutive
 * elements are packed SIZE bytes apart.
 */
struct {
    char s[SIZE];
} bar[N];
/*
 * tic()/toc() bracket a region of code and print how long it took.
 *
 * tic() opens a `do {` block, declares the two timestamps and records the
 * start time. toc() records the end time, prints the elapsed seconds
 * followed by "s", and closes the block with `} while (0)`. The two macros
 * must therefore be used as a matched pair inside the same scope, each
 * followed by a semicolon.
 */
#define tic() \
    do { \
        struct timespec ts_start, ts_end; \
        clock_gettime(CLOCK_MONOTONIC, &ts_start)

#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &ts_end); \
        printf("%lfs\n", \
               (ts_end.tv_sec - ts_start.tv_sec) + \
               (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); \
    } while (0)
/*
 * Benchmark driver.
 *
 * Fills foo and bar with pseudo-random bytes, then runs the same workload
 * on each array in turn: T times, pick a random element and add up its SIZE
 * bytes. After each run the elapsed time is printed (by toc()), followed by
 * the accumulated sum on its own line.
 */
int main(void)
{
    int sum;

    /* Populate both arrays in lockstep; every byte ends up in 0..99. */
    for (int elem = 0; elem < N; ++elem) {
        for (int byte = 0; byte < SIZE; ++byte) {
            foo[elem].s[byte] = rand() % 100;
            bar[elem].s[byte] = rand() % 100;
        }
    }

    /* Timed run over the 64-byte-aligned array. */
    sum = 0;
    tic();
    for (int round = 0; round < T; ++round) {
        int pick = rand() % N;
        for (int byte = 0; byte < SIZE; ++byte) {
            sum += foo[pick].s[byte];
        }
    }
    toc();
    printf("%d\n", sum);

    /* Same workload over the array without the alignment attribute. */
    sum = 0;
    tic();
    for (int round = 0; round < T; ++round) {
        int pick = rand() % N;
        for (int byte = 0; byte < SIZE; ++byte) {
            sum += bar[pick].s[byte];
        }
    }
    toc();
    printf("%d\n", sum);

    return 0;
}
操作系统编程中,程序的性能至关重要。在现代计算机体系结构中,CPU缓存对性能有着举足轻重的影响。本系列文章通过实验,给出一些合理利用CPU缓存的建议。
== 对齐缓存线
内核代码中,类似 `__attribute__((aligned(256)))` 的代码,到处可见。下面通过实验验证其对性能的影响。代码中,构造了两个结构数组,结构体大小为63个字节,其中一个对齐到64字节,而另一个没有;构造完成后,随机选取结构数组的元素进行操作。代码如下:
[source, c]
----
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Benchmark dimensions. */
#define N 65536     /* elements in each array */
#define T 100000000 /* random element visits per timed run */
#define SIZE 63     /* payload bytes per element */

/*
 * foo: the element type is aligned to 64 bytes, so each element is padded
 * from SIZE up to 64 bytes and every element starts on a 64-byte boundary.
 */
struct __attribute__((aligned(64))) {
    char s[SIZE];
} foo[N];

/*
 * bar: identical payload with no alignment request, so consecutive
 * elements are packed SIZE bytes apart.
 */
struct {
    char s[SIZE];
} bar[N];
/*
 * tic()/toc() bracket a region of code and print how long it took.
 *
 * tic() opens a `do {` block, declares the two timestamps and records the
 * start time. toc() records the end time, prints the elapsed seconds
 * followed by "s", and closes the block with `} while (0)`. Always use them
 * as a matched pair inside the same scope.
 */
#define tic() \
    do { \
        struct timespec ts_start, ts_end; \
        clock_gettime(CLOCK_MONOTONIC, &ts_start)

#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &ts_end); \
        printf("%lfs\n", \
               (ts_end.tv_sec - ts_start.tv_sec) + \
               (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); \
    } while (0)
/*
 * Benchmark driver.
 *
 * Fills foo and bar with pseudo-random bytes, then runs the same workload
 * on each array in turn: T times, pick a random element and add up its SIZE
 * bytes. toc() prints the elapsed time of each run.
 */
int main(void)
{
    /*
     * The accumulator is unsigned on purpose: with T = 100000000 visits of
     * up to 63 * 99 each, the total can far exceed INT_MAX where int is
     * 32 bits. Unsigned arithmetic wraps with defined behavior, whereas
     * signed overflow is undefined. The value itself is only a checksum.
     */
    unsigned int s;

    /*
     * Each sum is stored here after its run. Without a consumer the sum
     * would be dead, and an optimizing compiler could drop the very loops
     * this program is meant to time.
     */
    volatile unsigned int sink;

    /* Populate both arrays in lockstep; every byte ends up in 0..99. */
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            foo[i].s[j] = rand() % 100;
            bar[i].s[j] = rand() % 100;
        }
    }

    /* Timed run over the 64-byte-aligned array. */
    s = 0;
    tic();
    for (int i = 0; i < T; ++i) {
        int idx = rand() % N;
        for (int j = 0; j < SIZE; ++j) {
            s += (unsigned int)foo[idx].s[j];
        }
    }
    toc();
    sink = s;

    /* Same workload over the array without the alignment attribute. */
    s = 0;
    tic();
    for (int i = 0; i < T; ++i) {
        int idx = rand() % N;
        for (int j = 0; j < SIZE; ++j) {
            s += (unsigned int)bar[idx].s[j];
        }
    }
    toc();
    sink = s;

    (void)sink;
    return 0;
}
----
运行结果如下:
----
$ gcc align.c -O2 -std=gnu99 ; ./a.out
5.732326s
7.576273s
----
操作对齐过的数据要快得多。
下面来看看缓存的情况:
----
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out
....
$ cg_annotate profile --auto=yes --show=D1mr,DLmr --context=1
....
0 0 for (int i = 0; i < T; ++i) {
0 0 int idx = rand() % N;
0 0 for (int j = 0; j < SIZE; ++j)
9,930 2,330 s += foo[idx].s[j];
. . }
0 0 for (int i = 0; i < T; ++i) {
0 0 int idx = rand() % N;
0 0 for (int j = 0; j < SIZE; ++j)
19,547 4,865 s += bar[idx].s[j];
. . }
----
果然不出所料,操作未对齐的数组,会出现更多的缓存未命中。由于数据没有对齐,单个结构体可能横跨两条缓存线,对它的一次操作往往会导致两次缓存未命中,这便是导致性能损失的原因。
由上面的实验可见,在一些情况下,我们可以牺牲一些空间来对齐数据,以换取速度的提升。
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment