Skip to content

Instantly share code, notes, and snippets.

Created December 23, 2011 08:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/1513601 to your computer and use it in GitHub Desktop.
Save anonymous/1513601 to your computer and use it in GitHub Desktop.
/*
*
* 結論:
* Cortex-A9 は Out-of-Order だが、手スケジューリングが効く。
* (というか、Cortex-A9はOut-of-Orderに見えない)
*
* どのプロセッサでも直線アクセスだとL2レイテンシは見えない。
* (ちゃんとHWプリフェッチが効いてる)
*
*
* E350 :
* メモリもリオーダーするOut-of-Orderなので
* 手でスケジューリングしても全く意味が無い
*
* Atom :
* In-Order。手でスケジューリングするとそれなりに効く
*
* Cortex-A9 :
* Out-of-Orderらしいが、read only でも手でスケジューリングするのが効いてるので謎
* 手でスケジューリングすると、Atom以上に効いてる。ように見える
*
*
* 計測方法:
* 分岐命令の数はとりあえず忘れて、
* 4回アンロールして、
* - 手でスケジューリングしたもの
* - ただのアンロール
* を比較して、アンロールがどの程度効果あるか調べる
*
* nosched :
* out[i+0] = in[i+0] * 1.1f;
* out[i+1] = in[i+1] * 1.1f;
* out[i+2] = in[i+2] * 1.1f;
* out[i+3] = in[i+3] * 1.1f;
*
* sched :
* v0 = in[i+0];
* v1 = in[i+1];
* v2 = in[i+2];
* v3 = in[i+3];
* out[i+0] = v0 * 1.1f;
* out[i+1] = v1 * 1.1f;
* out[i+2] = v2 * 1.1f;
* out[i+3] = v3 * 1.1f;
*
* read only:
* sum0 += in[i+0] * 1.1f;
* sum1 += in[i+1] * 1.1f;
* sum2 += in[i+2] * 1.1f;
* sum3 += in[i+3] * 1.1f;
*
* read only sched:
* v0 = in[i+0] * 1.1f;
* v1 = in[i+1] * 1.1f;
* v2 = in[i+2] * 1.1f;
* v3 = in[i+3] * 1.1f;
* sum0 += v0;
* sum1 += v1;
* sum2 += v2;
* sum3 += v3;
*
* nosched-hazard (ふたつのポインタがオーバーラップして一個前の演算に依存する):
* p0 = in
* p1 = p0+1
*
* p1[i+0] = p0[i+0] * 1.1f;
* p1[i+1] = p0[i+1] * 1.1f;
* p1[i+2] = p0[i+2] * 1.1f;
* p1[i+3] = p0[i+3] * 1.1f;
*
*
* sched-hazard (ふたつのポインタがオーバーラップしてるけど気にしないで手でスケジューリング):
* p0 = in
* p1 = p0+1
*
* v0 = p0[i+0];
* v1 = p0[i+1];
* v2 = p0[i+2];
* v3 = p0[i+3];
*
* p1[i+0] = v0 * 1.1f;
* p1[i+1] = v1 * 1.1f;
* p1[i+2] = v2 * 1.1f;
* p1[i+3] = v3 * 1.1f;
*
*
*
*
*
* 結果:
* E350 (1.6GHz) :
* | $ ./a.out 1024
* | nosched 3108.794678[clk/loop], 3.035932[cyc/data]
* | sched 3100.598877[clk/loop], 3.027929[cyc/data]
* | read only 2624.018555[clk/loop], 2.562518[cyc/data]
* | read only sched 2855.861572[clk/loop], 2.788927[cyc/data]
* | nosched-hazard 14360.824951[clk/loop], 14.024243[cyc/data]
* | sched-hazard 3122.873291[clk/loop], 3.049681[cyc/data]
* |
* | $ ./a.out 16384
* | nosched 49393.120605[clk/loop], 3.014717[cyc/data]
* | sched 50288.229492[clk/loop], 3.069350[cyc/data]
* | read only 41004.123779[clk/loop], 2.502693[cyc/data]
* | read only sched 45133.377686[clk/loop], 2.754723[cyc/data]
* | nosched-hazard 231441.703857[clk/loop], 14.126081[cyc/data]
* | sched-hazard 50742.266602[clk/loop], 3.097062[cyc/data]
*
* Atom N550(1.5GHz) :
* | $ ./a.out 1024
* | nosched 6486.279785[clk/loop], 6.334258[cyc/data]
* | sched 3380.222900[clk/loop], 3.300999[cyc/data]
* | read only 5689.654541[clk/loop], 5.556303[cyc/data]
* | read only sched 3140.129883[clk/loop], 3.066533[cyc/data]
* | nosched-hazard 13649.058105[clk/loop], 13.329158[cyc/data]
* | sched-hazard 3381.005127[clk/loop], 3.301763[cyc/data]
* |
* | $ ./a.out 16384
* | nosched 112515.517090[clk/loop], 6.867402[cyc/data]
* | sched 64807.787109[clk/loop], 3.955553[cyc/data]
* | read only 90629.773682[clk/loop], 5.531602[cyc/data]
* | read only sched 49514.967773[clk/loop], 3.022154[cyc/data]
* | nosched-hazard 232532.070557[clk/loop], 14.192631[cyc/data]
* | sched-hazard 75640.708740[clk/loop], 4.616742[cyc/data]
*
* Tegra - Cortex A9(1.0GHz) :
* | $ ./a.out 1024
* | nosched 8347.167969[clk/loop], 8.151531[cyc/data]
* | sched 3715.332031[clk/loop], 3.628254[cyc/data]
* | read only 8360.351562[clk/loop], 8.164406[cyc/data]
* | read only sched 3895.507812[clk/loop], 3.804207[cyc/data]
* | nosched-hazard 14437.255859[clk/loop], 14.098883[cyc/data]
* | sched-hazard 3656.005859[clk/loop], 3.570318[cyc/data]
* |
* | $ ./a.out 16384
* | nosched 132061.523438[clk/loop], 8.060396[cyc/data]
* | sched 57900.146484[clk/loop], 3.533944[cyc/data]
* | read only 131998.291016[clk/loop], 8.056536[cyc/data]
* | read only sched 61907.226562[clk/loop], 3.778517[cyc/data]
* | nosched-hazard 282537.597656[clk/loop], 17.244726[cyc/data]
* | sched-hazard 108981.445312[clk/loop], 6.651700[cyc/data]
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#if (defined __x86_64__) || (defined __i386__)
#include <x86intrin.h>
#define cpu_clock() __rdtsc()
#define FREG(val) "+x"(val)
#else
#include <sys/time.h>
#define FREG(val) "+w"(val)
/*
 * Fallback time source for targets without __rdtsc().
 *
 * Reads the wall clock with gettimeofday() and returns it in nanoseconds.
 * Callers treat the value as a cycle count, which is only numerically
 * correct on a core clocked at 1GHz.
 */
static inline unsigned long long
cpu_clock()
{
    struct timeval now;
    unsigned long long nsec;

    gettimeofday(&now, NULL);

    /* whole seconds -> ns, then add the microsecond part scaled to ns */
    nsec = now.tv_sec*1000000000ULL;
    nsec += now.tv_usec*1000;
    return nsec;
}
#endif
typedef float data_type;
/* Benchmark payload: one single-precision multiply of `a` by 1.1f. */
static inline data_type
f(data_type a)
{
    const data_type scaled = 1.1f * a;

    return scaled;
}
/*
 * Same multiply as f(), but the product is passed through an empty asm
 * statement as a read/write register operand.  The compiler must therefore
 * materialise the product on its own, so a following add cannot be fused
 * with the multiply into a multiply-accumulate instruction.
 */
static inline data_type
f2(data_type a)
{
    data_type prod = 1.1f * a;

    /* "use" the multiply result so it does not become a MAC */
    asm volatile (" ":FREG(prod));
    return prod;
}
/*
 * Unrolled-by-4 kernel without hand scheduling.
 *
 * Each element is loaded, multiplied and stored before the next element is
 * touched.  When `out` overlaps `in` shifted by one element, every load
 * therefore depends on the store issued just before it.
 *
 * The loop advances in steps of 4 and always touches 4 elements per step.
 */
static void
__attribute__((noinline,noclone))
func_nosched(data_type *out, data_type *in, int n)
{
    int base = 0;

    while (base < n) {
        data_type *dst = out + base;
        data_type *src = in + base;

        /* load, mul, store -- strictly one element at a time */
        dst[0] = f(src[0]);
        dst[1] = f(src[1]);
        dst[2] = f(src[2]);
        dst[3] = f(src[3]);

        base += 4;
    }
}
/*
 * Read-only kernel without hand scheduling.
 *
 * Four independent accumulators each receive in[i+k] * 1.1f.  After every
 * accumulate an empty asm statement with a "memory" clobber takes the
 * accumulator as a read/write operand, which keeps the compiler from moving
 * the next load/multiply ahead of the current add.
 *
 * `out` is unused; it is only there so the signature matches the other
 * kernels.  Returns the sum of the four accumulators.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only(data_type *out, data_type *in, int n)
{
    data_type acc0 = 0;
    data_type acc1 = 0;
    data_type acc2 = 0;
    data_type acc3 = 0;
    int base = 0;

    while (base < n) {
        /* load, mul, add -- one lane at a time, fenced after each add */
        acc0 += f2(in[base+0]);
        asm volatile (" " :FREG(acc0):: "memory");
        acc1 += f2(in[base+1]);
        asm volatile (" " :FREG(acc1):: "memory");
        acc2 += f2(in[base+2]);
        asm volatile (" " :FREG(acc2):: "memory");
        acc3 += f2(in[base+3]);
        asm volatile (" " :FREG(acc3):: "memory");

        base += 4;
    }
    return acc0 + acc1 + acc2 + acc3;
}
/*
 * Register-only variant of the read-only kernel, written directly in asm,
 * for the "no output dependency" case.
 *
 * Each loop iteration issues four copy / multiply / add groups.  In every
 * group the multiply writes a register that is different from the one the
 * preceding copy wrote (%6, %7, %0, %2 in turn), so consecutive groups do
 * not reuse one destination register.
 *
 * Neither `in` nor `out` is referenced: the asm never touches memory, the
 * value fed to each multiply is the constant operand %8 (1.1f).
 *
 * NOTE(review): the mnemonics (fcpys/fmuls/fadds) and the "w" constraint
 * look like ARM VFP -- this function presumably only assembles on ARM;
 * confirm before building for x86.
 * NOTE(review): %7 is bound to sum3 but is also used as the multiply
 * destination of the second group, and the third group's accumulator %5
 * (sum2) is the only sum not overwritten besides %1/%3, so the returned
 * value is not a meaningful sum.  Presumably only the timing matters here.
 * NOTE(review): the outputs are plain "=w" (no '&' early-clobber) although
 * %8 is still read after outputs have been written -- confirm the compiler
 * cannot place %8 in the same register as one of the outputs.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_no_output_dep(data_type *out, data_type *in, int n)
{
int i;
data_type sum0 = 0;
data_type sum1 = 0;
data_type sum2 = 0;
data_type sum3 = 0;
data_type tmp0, tmp1, tmp2, tmp3;
for (i=0; i<n; i+=4) {
/* copy, mul, add (no load is performed; operands stay in registers) */
asm volatile (""
"fcpys %0, %8\n\t"
"fmuls %6, %0, %8\n\t"
"fadds %1, %1, %6\n\t"
"fcpys %2, %8\n\t"
"fmuls %7, %2, %8\n\t"
"fadds %3, %3, %7\n\t"
"fcpys %4, %8\n\t"
"fmuls %0, %4, %8\n\t"
"fadds %5, %5, %0\n\t"
"fcpys %6, %8\n\t"
"fmuls %2, %6, %8\n\t"
"fadds %7, %7, %2\n\t"
:"=w"(tmp0),
"+w"(sum0),
"=w"(tmp1),
"+w"(sum1),
"=w"(tmp2),
"+w"(sum2),
"=w"(tmp3),
"+w"(sum3) /* %7 */
:"w"(1.1f));
}
return sum0 + sum1 + sum2 + sum3;
}
/*
 * Register-only variant of the read-only kernel, written directly in asm,
 * for the "with output dependency" case.
 *
 * Each loop iteration issues four copy / multiply / add groups, and every
 * group uses the same scratch register %0 (tmp0) as the destination of both
 * the copy and the multiply.  Successive groups therefore keep rewriting one
 * register, while the adds accumulate into %1, %3, %5 and %7 (sum0..sum3).
 *
 * Neither `in` nor `out` is referenced: the asm never touches memory, the
 * value fed to each multiply is the constant operand %8 (1.1f).
 * tmp1..tmp3 are declared as outputs but the template never writes them.
 *
 * NOTE(review): the mnemonics (fcpys/fmuls/fadds) and the "w" constraint
 * look like ARM VFP -- this function presumably only assembles on ARM;
 * confirm before building for x86.
 * NOTE(review): %0 is a plain "=w" output (no '&' early-clobber) although
 * %8 is still read after %0 has been written -- confirm the compiler cannot
 * place %8 in the same register as %0.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_output_dep(data_type *out, data_type *in, int n)
{
int i;
data_type sum0 = 0;
data_type sum1 = 0;
data_type sum2 = 0;
data_type sum3 = 0;
data_type tmp0, tmp1, tmp2, tmp3;
for (i=0; i<n; i+=4) {
/* copy, mul, add (no load is performed; operands stay in registers) */
asm volatile (""
"fcpys %0, %8\n\t"
"fmuls %0, %0, %8\n\t"
"fadds %1, %1, %0\n\t"
"fcpys %0, %8\n\t"
"fmuls %0, %0, %8\n\t"
"fadds %3, %3, %0\n\t"
"fcpys %0, %8\n\t"
"fmuls %0, %0, %8\n\t"
"fadds %5, %5, %0\n\t"
"fcpys %0, %8\n\t"
"fmuls %0, %0, %8\n\t"
"fadds %7, %7, %0\n\t"
:"=w"(tmp0),
"+w"(sum0),
"=w"(tmp1),
"+w"(sum1),
"=w"(tmp2),
"+w"(sum2),
"=w"(tmp3),
"+w"(sum3) /* %7 */
:"w"(1.1f));
}
return sum0 + sum1 + sum2 + sum3;
}
/*
 * Read-only kernel with hand scheduling.
 *
 * All four load+multiply operations of a step are written first, and only
 * then are the four products added into their accumulators, so no add sits
 * directly behind the multiply that feeds it.
 *
 * `out` is unused; it is only there so the signature matches the other
 * kernels.  Returns the sum of the four accumulators.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_sched(data_type *out, data_type *in, int n)
{
    data_type acc0 = 0;
    data_type acc1 = 0;
    data_type acc2 = 0;
    data_type acc3 = 0;
    int base = 0;

    while (base < n) {
        /* phase 1: load + mul for all four lanes */
        data_type p0 = f2(in[base+0]);
        data_type p1 = f2(in[base+1]);
        data_type p2 = f2(in[base+2]);
        data_type p3 = f2(in[base+3]);

        /* phase 2: accumulate */
        acc0 += p0;
        acc1 += p1;
        acc2 += p2;
        acc3 += p3;

        base += 4;
    }
    return acc0 + acc1 + acc2 + acc3;
}
/*
 * Unrolled-by-4 kernel with hand scheduling.
 *
 * The four loads of a step are written before any of the four
 * multiply+store operations.  When `out` overlaps `in`, this reads elements
 * before the stores of the same step can overwrite them, so the result
 * differs from func_nosched() on overlapping buffers.
 *
 * The loop advances in steps of 4 and always touches 4 elements per step.
 */
static void
__attribute__((noinline,noclone))
func_sched(data_type *out, data_type *in, int n)
{
    int base = 0;

    while (base < n) {
        /* phase 1: all loads */
        data_type x0 = in[base+0];
        data_type x1 = in[base+1];
        data_type x2 = in[base+2];
        data_type x3 = in[base+3];

        /* phase 2: mul + store */
        out[base+0] = f(x0);
        out[base+1] = f(x1);
        out[base+2] = f(x2);
        out[base+3] = f(x3);

        base += 4;
    }
}
/*
 * Print one result row: average clocks per kernel call and per element.
 * `clk` is the elapsed cpu_clock() delta over `nloop` calls on `ndata`
 * elements.
 */
static void
print_result(const char *label, unsigned long long clk, int nloop, int ndata)
{
    printf("%s %f[clk/loop], %f[cyc/data]\n",
           label, clk/(double)nloop, clk/((double)nloop*ndata));
}

/*
 * Benchmark driver.
 *
 * usage: a.out [ndata]
 *
 * Runs every kernel `nloop` times over `ndata` elements (default 1024) and
 * prints the average cost of each.  Returns EXIT_FAILURE on a bad argument
 * or allocation failure, 0 otherwise.
 *
 * Buffer sizing: the kernels are unrolled by 4 and always touch a full
 * group, and the "hazard" runs use in+1 as the destination, which writes
 * one element beyond the last group.  The buffers are therefore allocated
 * with ndata rounded up to a multiple of 4, plus one element.
 */
int
main(int argc, char **argv)
{
    enum { UNROLL = 4, ALIGNMENT = 128, MAX_NDATA = 1 << 28 };
    unsigned long long b, e;
    int i;
    int nloop = 1024*4;
    int ndata = 1024;
    size_t nalloc;
    size_t k;
    data_type *in;
    data_type *out;

    if (argc > 1) {
        char *end;
        long v = strtol(argv[1], &end, 10);

        /*
         * Reject non-numeric, non-positive and huge counts.  An overflowing
         * input makes strtol return LONG_MAX, which the upper bound catches.
         * The bound also keeps the kernels' int index from overflowing.
         */
        if (end == argv[1] || *end != '\0' || v < 1 || v > MAX_NDATA) {
            fprintf(stderr, "usage: %s [ndata (1..%d)]\n", argv[0], (int)MAX_NDATA);
            return EXIT_FAILURE;
        }
        ndata = (int)v;
    }

    /* full unrolled groups + 1 element for the in+1 hazard destination */
    nalloc = ((size_t)ndata + UNROLL - 1) / UNROLL * UNROLL + 1;

    in = memalign(ALIGNMENT, sizeof *in * nalloc);
    out = memalign(ALIGNMENT, sizeof *out * nalloc);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "failed to allocate %lu elements\n", (unsigned long)nalloc);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    /*
     * Give the buffers defined contents.  Zero stays zero under repeated
     * multiplication, so the in-place hazard runs never drift into
     * inf/denormal values that could distort the timings.
     */
    for (k = 0; k < nalloc; k++) {
        in[k] = 0;
        out[k] = 0;
    }

    /* warm-up */
    func_nosched(out, in, ndata);
    func_sched(out, in, ndata);

    /* no dependency, no scheduling */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_nosched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("nosched", e-b, nloop, ndata);

    /* no dependency, hand scheduled */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_sched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("sched", e-b, nloop, ndata);

    /* read-only, no scheduling */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only", e-b, nloop, ndata);

    /* read-only, hand scheduled */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_sched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only sched", e-b, nloop, ndata);

    /* read-only, no output dependency */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_no_output_dep(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only no output dep", e-b, nloop, ndata);

    /* read-only, with output dependency */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_output_dep(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only output dep", e-b, nloop, ndata);

    /* with dependency: destination overlaps source shifted by one element */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_nosched(in+1, in, ndata);
    }
    e = cpu_clock();
    print_result("nosched-hazard", e-b, nloop, ndata);

    /* dependency deliberately broken (changes the result, so not a valid transform) */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_sched(in+1, in, ndata);
    }
    e = cpu_clock();
    print_result("sched-hazard", e-b, nloop, ndata);

    free(out);
    free(in);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment