5kg/align.c

## align.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Number of elements in each benchmark array. */
#define N 65536
/* Number of randomly chosen elements visited by each timed pass. */
#define T 10000
/* Payload bytes held by one element. */
#define SIZE 63

/*
 * foo: each element carries SIZE payload bytes and the element type is
 * given 64-byte alignment, so every element starts on a 64-byte boundary.
 */
struct __attribute__((aligned(64))) {
    char s[SIZE];
} foo[N];

/*
 * bar: same payload with no alignment request, so consecutive elements
 * are laid out back to back.
 */
struct {
    char s[SIZE];
} bar[N];

/*
 * tic()/toc() bracket a region to be timed and must be used as a pair in
 * the same scope: tic() opens a do-block and records the start time;
 * toc() records the end time, prints the elapsed seconds, and closes the
 * block with "while (0)".
 */
#define tic() \
    do { \
        struct timespec tic_begin, tic_finish; \
        clock_gettime(CLOCK_MONOTONIC, &tic_begin)
#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &tic_finish); \
        printf("%lfs\n", (tic_finish.tv_sec - tic_begin.tv_sec) + \
                         (double)(tic_finish.tv_nsec - tic_begin.tv_nsec)/1e9); \
    } while (0)

/*
 * Benchmark driver.
 *
 * Fills foo and bar with pseudo-random values in [0, 99], then runs two
 * timed passes of T iterations each. Every iteration picks a random element
 * and sums its SIZE bytes. The first pass reads foo, the second reads bar.
 * Each pass prints its elapsed time (via toc()) followed by the sum.
 */
int main()
{
    /* Populate both arrays; rand() is consumed alternately for foo and bar. */
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < SIZE; ++col) {
            foo[row].s[col] = rand() % 100;
            bar[row].s[col] = rand() % 100;
        }
    }

    int total = 0;

    /* Timed pass over foo. */
    tic();
    for (int round = 0; round < T; ++round) {
        const char *bytes = foo[rand() % N].s;
        for (int col = 0; col < SIZE; ++col) {
            total += bytes[col];
        }
    }
    toc();
    printf("%d\n", total);

    /* Timed pass over bar, restarting the sum from zero. */
    total = 0;
    tic();
    for (int round = 0; round < T; ++round) {
        const char *bytes = bar[rand() % N].s;
        for (int col = 0; col < SIZE; ++col) {
            total += bytes[col];
        }
    }
    toc();
    printf("%d\n", total);
}

## align.txt
操作系统编程中，程序的性能至关重要。现代计算机体系结构中，CPU缓存对性能起着至关重要的影响。本系列文章通过实验，说明一些合理利用CPU缓存的建议。

== 对齐缓存线
内核代码中，类似 `__attribute__((aligned(256)))` 的代码，到处可见。下面通过实验验证其对性能的影响。代码中，构造了两个结构数组，结构体大小为63个字节，其中一个对齐到64字节，而另一个没有；构造完成后，随机选取结构数组的元素进行操作。代码如下：

[source, c]
----
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Number of elements in each benchmark array. */
#define N 65536
/* Number of randomly chosen elements visited by each timed pass. */
#define T 100000000
/* Payload bytes held by one element. */
#define SIZE 63

/*
 * foo: each element carries SIZE payload bytes and the element type is
 * given 64-byte alignment, so every element starts on a 64-byte boundary.
 */
struct __attribute__((aligned(64))) {
    char s[SIZE];
} foo[N];

/*
 * bar: same payload with no alignment request, so consecutive elements
 * are laid out back to back.
 */
struct {
    char s[SIZE];
} bar[N];

/*
 * tic()/toc() bracket a region to be timed and must be used as a pair in
 * the same scope: tic() opens a do-block and records the start time;
 * toc() records the end time, prints the elapsed seconds, and closes the
 * block with "while (0)".
 */
#define tic() \
    do { \
        struct timespec tic_begin, tic_finish; \
        clock_gettime(CLOCK_MONOTONIC, &tic_begin)
#define toc() \
        clock_gettime(CLOCK_MONOTONIC, &tic_finish); \
        printf("%lfs\n", (tic_finish.tv_sec - tic_begin.tv_sec) + \
                         (double)(tic_finish.tv_nsec - tic_begin.tv_nsec)/1e9); \
    } while (0)

/*
 * Benchmark driver.
 *
 * Fills foo and bar with pseudo-random values in [0, 99], then runs two
 * timed passes of T iterations each. Every iteration picks a random element
 * and sums its SIZE bytes. The first pass reads foo, the second reads bar.
 * Each pass prints its elapsed time via toc().
 */
int main()
{
    /* Populate both arrays; rand() is consumed alternately for foo and bar. */
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < SIZE; ++j) {
            foo[i].s[j] = rand() % 100;
            bar[i].s[j] = rand() % 100;
        }

    /*
     * Wide accumulator: with T = 100000000 iterations of SIZE bytes each the
     * sum can reach roughly 6.2e11, which exceeds INT_MAX where int is
     * 32 bits; overflowing a signed int is undefined behavior.
     * NOTE(review): s is never read after the loops, so an optimizing
     * compiler may discard the summation - confirm the timings still
     * measure the memory accesses.
     */
    long long s;

    /* Timed pass over the 64-byte-aligned array. */
    s = 0;
    tic();
    for (int i = 0; i < T; ++i) {
        int idx = rand() % N;
        for (int j = 0; j < SIZE; ++j)
            s += foo[idx].s[j];
    }
    toc();

    /* Timed pass over the array without an alignment request. */
    s = 0;
    tic();
    for (int i = 0; i < T; ++i) {
        int idx = rand() % N;
        for (int j = 0; j < SIZE; ++j)
            s += bar[idx].s[j];
    }
    toc();
}
----
运行结果如下：
----
$ gcc align.c -O2 -std=gnu99 ; ./a.out
5.732326s
7.576273s
----
操作对齐过的数据要快得多。

下面来看看缓存的情况：
----
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out
....
$ cg_annotate profile --auto=yes --show=D1mr,DLmr --context=1
....
     0     0      for (int i = 0; i < T; ++i) {
     0     0          int idx = rand() % N;
     0     0          for (int j = 0; j < SIZE; ++j)
 9,930 2,330              s += foo[idx].s[j];
     .     .      }

     0     0      for (int i = 0; i < T; ++i) {
     0     0          int idx = rand() % N;
     0     0          for (int j = 0; j < SIZE; ++j)
19,547 4,865              s += bar[idx].s[j];
     .     .      }
----

果然不出所料，操作未对齐的数组，会出现更多的缓存未命中。由于数据没有对齐，一个结构体往往会跨越两条缓存线，对单独一个结构体的操作就可能导致两次缓存未命中，这便是导致性能损失的原因。

由上面的实验，在一些情况下，我们可以牺牲一些空间，对齐数据，而换取速度的提升。
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	/* Number of elements in each benchmark array. */
	#define N 65536
	/* Number of randomly chosen elements visited by each timed pass. */
	#define T 10000
	/* Payload bytes held by one element. */
	#define SIZE 63

	/*
	 * foo: each element carries SIZE payload bytes and the element type is
	 * given 64-byte alignment, so every element starts on a 64-byte boundary.
	 */
	struct __attribute__((aligned(64))) {
	    char s[SIZE];
	} foo[N];

	/*
	 * bar: same payload with no alignment request, so consecutive elements
	 * are laid out back to back.
	 */
	struct {
	    char s[SIZE];
	} bar[N];

	/*
	 * tic()/toc() bracket a region to be timed and must be used as a pair in
	 * the same scope: tic() opens a do-block and records the start time;
	 * toc() records the end time, prints the elapsed seconds, and closes the
	 * block with "while (0)".
	 */
	#define tic() \
	    do { \
	        struct timespec tic_begin, tic_finish; \
	        clock_gettime(CLOCK_MONOTONIC, &tic_begin)
	#define toc() \
	        clock_gettime(CLOCK_MONOTONIC, &tic_finish); \
	        printf("%lfs\n", (tic_finish.tv_sec - tic_begin.tv_sec) + \
	                         (double)(tic_finish.tv_nsec - tic_begin.tv_nsec)/1e9); \
	    } while (0)

	/*
	 * Benchmark driver.
	 *
	 * Fills foo and bar with pseudo-random values in [0, 99], then runs two
	 * timed passes of T iterations each. Every iteration picks a random element
	 * and sums its SIZE bytes. The first pass reads foo, the second reads bar.
	 * Each pass prints its elapsed time (via toc()) followed by the sum.
	 */
	int main()
	{
	    /* Populate both arrays; rand() is consumed alternately for foo and bar. */
	    for (int row = 0; row < N; ++row) {
	        for (int col = 0; col < SIZE; ++col) {
	            foo[row].s[col] = rand() % 100;
	            bar[row].s[col] = rand() % 100;
	        }
	    }

	    int total = 0;

	    /* Timed pass over foo. */
	    tic();
	    for (int round = 0; round < T; ++round) {
	        const char *bytes = foo[rand() % N].s;
	        for (int col = 0; col < SIZE; ++col) {
	            total += bytes[col];
	        }
	    }
	    toc();
	    printf("%d\n", total);

	    /* Timed pass over bar, restarting the sum from zero. */
	    total = 0;
	    tic();
	    for (int round = 0; round < T; ++round) {
	        const char *bytes = bar[rand() % N].s;
	        for (int col = 0; col < SIZE; ++col) {
	            total += bytes[col];
	        }
	    }
	    toc();
	    printf("%d\n", total);
	}
	操作系统编程中，程序的性能至关重要。现代计算机体系结构中，CPU缓存对性能起着至关重要的影响。本系列文章通过实验，说明一些合理利用CPU缓存的建议。

	== 对齐缓存线
	内核代码中，类似 `__attribute__((aligned(256)))` 的代码，到处可见。下面通过实验验证其对性能的影响。代码中，构造了两个结构数组，结构体大小为63个字节，其中一个对齐到64字节，而另一个没有；构造完成后，随机选取结构数组的元素进行操作。代码如下：

	[source, c]
	----
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	/* Number of elements in each benchmark array. */
	#define N 65536
	/* Number of randomly chosen elements visited by each timed pass. */
	#define T 100000000
	/* Payload bytes held by one element. */
	#define SIZE 63

	/*
	 * foo: each element carries SIZE payload bytes and the element type is
	 * given 64-byte alignment, so every element starts on a 64-byte boundary.
	 */
	struct __attribute__((aligned(64))) {
	    char s[SIZE];
	} foo[N];

	/*
	 * bar: same payload with no alignment request, so consecutive elements
	 * are laid out back to back.
	 */
	struct {
	    char s[SIZE];
	} bar[N];

	/*
	 * tic()/toc() bracket a region to be timed and must be used as a pair in
	 * the same scope: tic() opens a do-block and records the start time;
	 * toc() records the end time, prints the elapsed seconds, and closes the
	 * block with "while (0)".
	 */
	#define tic() \
	    do { \
	        struct timespec tic_begin, tic_finish; \
	        clock_gettime(CLOCK_MONOTONIC, &tic_begin)
	#define toc() \
	        clock_gettime(CLOCK_MONOTONIC, &tic_finish); \
	        printf("%lfs\n", (tic_finish.tv_sec - tic_begin.tv_sec) + \
	                         (double)(tic_finish.tv_nsec - tic_begin.tv_nsec)/1e9); \
	    } while (0)

	/*
	 * Benchmark driver.
	 *
	 * Fills foo and bar with pseudo-random values in [0, 99], then runs two
	 * timed passes of T iterations each. Every iteration picks a random element
	 * and sums its SIZE bytes. The first pass reads foo, the second reads bar.
	 * Each pass prints its elapsed time via toc().
	 */
	int main()
	{
	    /* Populate both arrays; rand() is consumed alternately for foo and bar. */
	    for (int i = 0; i < N; ++i)
	        for (int j = 0; j < SIZE; ++j) {
	            foo[i].s[j] = rand() % 100;
	            bar[i].s[j] = rand() % 100;
	        }

	    /*
	     * Wide accumulator: with T = 100000000 iterations of SIZE bytes each the
	     * sum can reach roughly 6.2e11, which exceeds INT_MAX where int is
	     * 32 bits; overflowing a signed int is undefined behavior.
	     * NOTE(review): s is never read after the loops, so an optimizing
	     * compiler may discard the summation - confirm the timings still
	     * measure the memory accesses.
	     */
	    long long s;

	    /* Timed pass over the 64-byte-aligned array. */
	    s = 0;
	    tic();
	    for (int i = 0; i < T; ++i) {
	        int idx = rand() % N;
	        for (int j = 0; j < SIZE; ++j)
	            s += foo[idx].s[j];
	    }
	    toc();

	    /* Timed pass over the array without an alignment request. */
	    s = 0;
	    tic();
	    for (int i = 0; i < T; ++i) {
	        int idx = rand() % N;
	        for (int j = 0; j < SIZE; ++j)
	            s += bar[idx].s[j];
	    }
	    toc();
	}
	----
	运行结果如下：
	----
	$ gcc align.c -O2 -std=gnu99 ; ./a.out
	5.732326s
	7.576273s
	----
	操作对齐过的数据要快得多。

	下面来看看缓存的情况：
	----
	$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out
	....
	$ cg_annotate profile --auto=yes --show=D1mr,DLmr --context=1
	....
	0 0 for (int i = 0; i < T; ++i) {
	0 0 int idx = rand() % N;
	0 0 for (int j = 0; j < SIZE; ++j)
	9,930 2,330 s += foo[idx].s[j];
	. . }

	0 0 for (int i = 0; i < T; ++i) {
	0 0 int idx = rand() % N;
	0 0 for (int j = 0; j < SIZE; ++j)
	19,547 4,865 s += bar[idx].s[j];
	. . }
	----

	果然不出所料，操作未对齐的数组，会出现更多的缓存未命中。由于数据没有对齐，一个结构体往往会跨越两条缓存线，对单独一个结构体的操作就可能导致两次缓存未命中，这便是导致性能损失的原因。

	由上面的实验，在一些情况下，我们可以牺牲一些空间，对齐数据，而换取速度的提升。