/*
 * Source: GitHub gist anonymous/1513601, created December 23, 2011 08:38.
 */
/* | |
* | |
* 結論: | |
* Cortex-A9 は Out-of-Order だが、手スケジューリングが効く。 | |
* (というか、Cortex-A9はOut-of-Orderに見えない) | |
* | |
* どのプロセッサでも直線アクセスだとL2レイテンシは見えない。 | |
* (ちゃんとHWプリフェッチが効いてる) | |
* | |
* | |
* E350 : | |
* メモリもリオーダーするOut-of-Orderなので | |
* 手でスケジューリングしても全く意味が無い | |
* | |
* Atom : | |
* In-Order。手でスケジューリングするとそれなりに効く | |
* | |
* Cortex-A9 : | |
* Out-of-Orderらしいが、read only でも手でスケジューリングするのが効いてるので謎 | |
* 手でスケジューリングすると、Atom以上に効いてる。ように見える | |
* | |
* | |
* 計測方法: | |
* 分岐命令の数はとりあえず忘れて、 | |
* 4回アンロールして、 | |
* - 手でスケジューリングしたもの | |
* - ただのアンロール | |
* を比較して、アンロールがどの程度効果あるか調べる | |
* | |
* nosched : | |
* out[i+0] = in[i+0] * 1.1f; | |
* out[i+1] = in[i+1] * 1.1f; | |
* out[i+2] = in[i+2] * 1.1f; | |
* out[i+3] = in[i+3] * 1.1f; | |
* | |
* sched :
* v0 = in[i+0];
* v1 = in[i+1];
* v2 = in[i+2];
* v3 = in[i+3];
* out[i+0] = v0 * 1.1f;
* out[i+1] = v1 * 1.1f;
* out[i+2] = v2 * 1.1f;
* out[i+3] = v3 * 1.1f;
* | |
* read only: | |
* sum0 += in[i+0] * 1.1f; | |
* sum1 += in[i+1] * 1.1f; | |
* sum2 += in[i+2] * 1.1f; | |
* sum3 += in[i+3] * 1.1f; | |
* | |
* read only sched: | |
* v0 = in[i+0] * 1.1f; | |
* v1 = in[i+1] * 1.1f; | |
* v2 = in[i+2] * 1.1f; | |
* v3 = in[i+3] * 1.1f; | |
* sum0 += v0; | |
* sum1 += v1; | |
* sum2 += v2; | |
* sum3 += v3; | |
* | |
* nosched-hazard (the two pointers overlap, so each operation depends on the previous one):
* p0 = in
* p1 = in+1
*
* p1[i+0] = p0[i+0] * 1.1f;
* p1[i+1] = p0[i+1] * 1.1f;
* p1[i+2] = p0[i+2] * 1.1f;
* p1[i+3] = p0[i+3] * 1.1f;
* | |
* | |
* sched-hazard (the two pointers overlap, but the loop is hand-scheduled anyway, ignoring the dependency):
* p0 = in
* p1 = in+1
*
* v0 = p0[i+0];
* v1 = p0[i+1];
* v2 = p0[i+2];
* v3 = p0[i+3];
*
* p1[i+0] = v0 * 1.1f;
* p1[i+1] = v1 * 1.1f;
* p1[i+2] = v2 * 1.1f;
* p1[i+3] = v3 * 1.1f;
* | |
* | |
* | |
* | |
* | |
* 結果: | |
* E350 (1.6GHz) : | |
* | $ ./a.out 1024 | |
* | nosched 3108.794678[clk/loop], 3.035932[cyc/data] | |
* | sched 3100.598877[clk/loop], 3.027929[cyc/data] | |
* | read only 2624.018555[clk/loop], 2.562518[cyc/data] | |
* | read only sched 2855.861572[clk/loop], 2.788927[cyc/data] | |
* | nosched-hazard 14360.824951[clk/loop], 14.024243[cyc/data] | |
* | sched-hazard 3122.873291[clk/loop], 3.049681[cyc/data] | |
* | | |
* | $ ./a.out 16384 | |
* | nosched 49393.120605[clk/loop], 3.014717[cyc/data] | |
* | sched 50288.229492[clk/loop], 3.069350[cyc/data] | |
* | read only 41004.123779[clk/loop], 2.502693[cyc/data] | |
* | read only sched 45133.377686[clk/loop], 2.754723[cyc/data] | |
* | nosched-hazard 231441.703857[clk/loop], 14.126081[cyc/data] | |
* | sched-hazard 50742.266602[clk/loop], 3.097062[cyc/data] | |
* | |
* Atom N550(1.5GHz) : | |
* | $ ./a.out 1024 | |
* | nosched 6486.279785[clk/loop], 6.334258[cyc/data] | |
* | sched 3380.222900[clk/loop], 3.300999[cyc/data] | |
* | read only 5689.654541[clk/loop], 5.556303[cyc/data] | |
* | read only sched 3140.129883[clk/loop], 3.066533[cyc/data] | |
* | nosched-hazard 13649.058105[clk/loop], 13.329158[cyc/data] | |
* | sched-hazard 3381.005127[clk/loop], 3.301763[cyc/data] | |
* | | |
* | $ ./a.out 16384 | |
* | nosched 112515.517090[clk/loop], 6.867402[cyc/data] | |
* | sched 64807.787109[clk/loop], 3.955553[cyc/data] | |
* | read only 90629.773682[clk/loop], 5.531602[cyc/data] | |
* | read only sched 49514.967773[clk/loop], 3.022154[cyc/data] | |
* | nosched-hazard 232532.070557[clk/loop], 14.192631[cyc/data] | |
* | sched-hazard 75640.708740[clk/loop], 4.616742[cyc/data] | |
* | |
* Tegra - Cortex A9(1.0GHz) : | |
* | $ ./a.out 1024 | |
* | nosched 8347.167969[clk/loop], 8.151531[cyc/data] | |
* | sched 3715.332031[clk/loop], 3.628254[cyc/data] | |
* | read only 8360.351562[clk/loop], 8.164406[cyc/data] | |
* | read only sched 3895.507812[clk/loop], 3.804207[cyc/data] | |
* | nosched-hazard 14437.255859[clk/loop], 14.098883[cyc/data] | |
* | sched-hazard 3656.005859[clk/loop], 3.570318[cyc/data] | |
* | | |
* | $ ./a.out 16384 | |
* | nosched 132061.523438[clk/loop], 8.060396[cyc/data] | |
* | sched 57900.146484[clk/loop], 3.533944[cyc/data] | |
* | read only 131998.291016[clk/loop], 8.056536[cyc/data] | |
* | read only sched 61907.226562[clk/loop], 3.778517[cyc/data] | |
* | nosched-hazard 282537.597656[clk/loop], 17.244726[cyc/data] | |
* | sched-hazard 108981.445312[clk/loop], 6.651700[cyc/data] | |
* | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <malloc.h> | |
/*
 * Platform glue for the benchmark:
 *   cpu_clock() - returns a tick count as unsigned long long
 *   FREG(val)   - read-write asm operand placing a float in a register
 */
#if defined(__x86_64__) || defined(__i386__)

/* x86: ticks come from __rdtsc(); float operands use the "x" constraint. */
#include <x86intrin.h>
#define cpu_clock() __rdtsc()
#define FREG(val) "+x"(val)

#else

/* Everything else: derive ticks from gettimeofday(); floats use "w". */
#include <sys/time.h>
#define FREG(val) "+w"(val)

static inline unsigned long long
cpu_clock()
{
    struct timeval now;
    unsigned long long nanos;

    gettimeofday(&now, NULL);

    /* Report nanoseconds, i.e. the ticks of a notional 1 GHz clock. */
    nanos = now.tv_sec * 1000000000ULL;
    nanos += now.tv_usec * 1000;
    return nanos;
}

#endif
/* Element type used by all benchmark kernels in this file. */
typedef float data_type;

/* Scale one element by 1.1f; this is the "mul" step of the store kernels. */
static inline data_type
f(data_type a)
{
    return a * 1.1f;
}
static inline data_type | |
f2(data_type a) { | |
data_type r = a*1.1f; | |
asm volatile (" ":FREG(r)); /* macにならないように乗算の結果を使う */ | |
return r; | |
} | |
static void | |
__attribute__((noinline,noclone)) | |
func_nosched(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, store */ | |
out[i+0] = f(in[i+0]); | |
out[i+1] = f(in[i+1]); | |
out[i+2] = f(in[i+2]); | |
out[i+3] = f(in[i+3]); | |
} | |
} | |
static data_type | |
__attribute__((noinline,noclone)) | |
func_read_only(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
data_type sum0 = 0; | |
data_type sum1 = 0; | |
data_type sum2 = 0; | |
data_type sum3 = 0; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, add */ | |
sum0 += f2(in[i+0]); | |
asm volatile (" " :FREG(sum0):: "memory"); | |
sum1 += f2(in[i+1]); | |
asm volatile (" " :FREG(sum1):: "memory"); | |
sum2 += f2(in[i+2]); | |
asm volatile (" " :FREG(sum2):: "memory"); | |
sum3 += f2(in[i+3]); | |
asm volatile (" " :FREG(sum3):: "memory"); | |
} | |
return sum0 + sum1 + sum2 + sum3; | |
} | |
static data_type | |
__attribute__((noinline,noclone)) | |
func_read_only_no_output_dep(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
data_type sum0 = 0; | |
data_type sum1 = 0; | |
data_type sum2 = 0; | |
data_type sum3 = 0; | |
data_type tmp0, tmp1, tmp2, tmp3; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, add */ | |
asm volatile ("" | |
"fcpys %0, %8\n\t" | |
"fmuls %6, %0, %8\n\t" | |
"fadds %1, %1, %6\n\t" | |
"fcpys %2, %8\n\t" | |
"fmuls %7, %2, %8\n\t" | |
"fadds %3, %3, %7\n\t" | |
"fcpys %4, %8\n\t" | |
"fmuls %0, %4, %8\n\t" | |
"fadds %5, %5, %0\n\t" | |
"fcpys %6, %8\n\t" | |
"fmuls %2, %6, %8\n\t" | |
"fadds %7, %7, %2\n\t" | |
:"=w"(tmp0), | |
"+w"(sum0), | |
"=w"(tmp1), | |
"+w"(sum1), | |
"=w"(tmp2), | |
"+w"(sum2), | |
"=w"(tmp3), | |
"+w"(sum3) /* %7 */ | |
:"w"(1.1f)); | |
} | |
return sum0 + sum1 + sum2 + sum3; | |
} | |
static data_type | |
__attribute__((noinline,noclone)) | |
func_read_only_output_dep(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
data_type sum0 = 0; | |
data_type sum1 = 0; | |
data_type sum2 = 0; | |
data_type sum3 = 0; | |
data_type tmp0, tmp1, tmp2, tmp3; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, add */ | |
asm volatile ("" | |
"fcpys %0, %8\n\t" | |
"fmuls %0, %0, %8\n\t" | |
"fadds %1, %1, %0\n\t" | |
"fcpys %0, %8\n\t" | |
"fmuls %0, %0, %8\n\t" | |
"fadds %3, %3, %0\n\t" | |
"fcpys %0, %8\n\t" | |
"fmuls %0, %0, %8\n\t" | |
"fadds %5, %5, %0\n\t" | |
"fcpys %0, %8\n\t" | |
"fmuls %0, %0, %8\n\t" | |
"fadds %7, %7, %0\n\t" | |
:"=w"(tmp0), | |
"+w"(sum0), | |
"=w"(tmp1), | |
"+w"(sum1), | |
"=w"(tmp2), | |
"+w"(sum2), | |
"=w"(tmp3), | |
"+w"(sum3) /* %7 */ | |
:"w"(1.1f)); | |
} | |
return sum0 + sum1 + sum2 + sum3; | |
} | |
static data_type | |
__attribute__((noinline,noclone)) | |
func_read_only_sched(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
data_type sum0 = 0; | |
data_type sum1 = 0; | |
data_type sum2 = 0; | |
data_type sum3 = 0; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, add */ | |
data_type v0 = f2(in[i+0]); | |
data_type v1 = f2(in[i+1]); | |
data_type v2 = f2(in[i+2]); | |
data_type v3 = f2(in[i+3]); | |
sum0 += v0; | |
sum1 += v1; | |
sum2 += v2; | |
sum3 += v3; | |
} | |
return sum0 + sum1 + sum2 + sum3; | |
} | |
static void | |
__attribute__((noinline,noclone)) | |
func_sched(data_type *out, data_type *in, int n) | |
{ | |
int i; | |
for (i=0; i<n; i+=4) { | |
/* load, mul, store */ | |
data_type f0 = in[i+0]; | |
data_type f1 = in[i+1]; | |
data_type f2 = in[i+2]; | |
data_type f3 = in[i+3]; | |
out[i+0] = f(f0); | |
out[i+1] = f(f1); | |
out[i+2] = f(f2); | |
out[i+3] = f(f3); | |
} | |
} | |
int | |
main(int argc, char **argv) | |
{ | |
unsigned long long b, e; | |
int i; | |
int nloop = 1024*4; | |
int ndata = 1024; | |
data_type *in; | |
data_type *out; | |
if (argc > 1) { | |
ndata = atoi(argv[1]); | |
} | |
in = memalign(128, sizeof(data_type) * ndata); | |
out = memalign(128, sizeof(data_type) * ndata); | |
func_nosched(out, in, ndata); | |
func_sched(out, in, ndata); | |
/* 依存無しスケジュール無し */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_nosched(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("nosched %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* 依存無し手スケジュール */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_sched(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("sched %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* スケジュール無しreadonly */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_read_only(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("read only %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* スケジュール有りreadonly */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_read_only_sched(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("read only sched %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* 出力依存無しreadonly */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_read_only_no_output_dep(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("read only no output dep %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* 出力依存有りreadonly */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_read_only_output_dep(out, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("read only output dep %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* 依存あり */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_nosched(in+1, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("nosched-hazard %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
/* 依存ブチ壊し(結果変わるのでよくないです) */ | |
b = cpu_clock(); | |
for (i=0; i<nloop; i++) { | |
func_sched(in+1, in, ndata); | |
} | |
e = cpu_clock(); | |
printf("sched-hazard %f[clk/loop], %f[cyc/data]\n", | |
(e-b)/(double)nloop, (e-b)/((double)nloop*ndata)); | |
} |
/* End of gist anonymous/1513601. */