/unroll-dep.c

## unroll-dep.c
/*
 *
 * 結論:
 * Cortex-A9 は Out-of-Order だが、手スケジューリングが効く。
 * (というか、Cortex-A9はOut-of-Orderに見えない)
 *
 * どのプロセッサでも直線アクセスだとL2レイテンシは見えない。
 * (ちゃんとHWプリフェッチが効いてる)
 *
 *
 * E350 :
 *  メモリもリオーダーするOut-of-Orderなので
 *  手でスケジューリングしても全く意味が無い
 *
 * Atom :
 *  In-Order。手でスケジューリングするとそれなりに効く
 *
 * Cortex-A9 :
 *  Out-of-Orderらしいが、read only でも手でスケジューリングするのが効いてるので謎
 *  手でスケジューリングすると、Atom以上に効いてる。ように見える
 *
 *
 * 計測方法:
 *  分岐命令の数はとりあえず忘れて、
 *  4回アンロールして、
 *    - 手でスケジューリングしたもの
 *    - ただのアンロール
 *  を比較して、アンロールがどの程度効果あるか調べる
 *
 * nosched :
 *   out[i+0] = in[i+0] * 1.1f;
 *   out[i+1] = in[i+1] * 1.1f;
 *   out[i+2] = in[i+2] * 1.1f;
 *   out[i+3] = in[i+3] * 1.1f;
 *
 * sched :
 *   v0 = in[i+0];
 *   v1 = in[i+1];
 *   v2 = in[i+2];
 *   v3 = in[i+3];
 *   out[i+0] = v0 * 1.1f;
 *   out[i+1] = v1 * 1.1f;
 *   out[i+2] = v2 * 1.1f;
 *   out[i+3] = v3 * 1.1f;
 *
 * read only:
 *   sum0 += in[i+0] * 1.1f;
 *   sum1 += in[i+1] * 1.1f;
 *   sum2 += in[i+2] * 1.1f;
 *   sum3 += in[i+3] * 1.1f;
 *
 * read only sched:
 *   v0 = in[i+0] * 1.1f;
 *   v1 = in[i+1] * 1.1f;
 *   v2 = in[i+2] * 1.1f;
 *   v3 = in[i+3] * 1.1f;
 *   sum0 += v0;
 *   sum1 += v1;
 *   sum2 += v2;
 *   sum3 += v3;
 *
 * nosched-hazard (ふたつのポインタがオーバーラップして一個前の演算に依存する):
 *   p0 = in
 *   p1 = in+1
 *
 *   p1[i+0] = p0[i+0] * 1.1f;
 *   p1[i+1] = p0[i+1] * 1.1f;
 *   p1[i+2] = p0[i+2] * 1.1f;
 *   p1[i+3] = p0[i+3] * 1.1f;
 *
 *
 * sched-hazard (ふたつのポインタがオーバーラップしてるけど気にしないで手でスケジューリング):
 *   p0 = in
 *   p1 = in+1
 *
 *   v0 = p0[i+0];
 *   v1 = p0[i+1];
 *   v2 = p0[i+2];
 *   v3 = p0[i+3];
 *
 *   p1[i+0] = v0 * 1.1f;
 *   p1[i+1] = v1 * 1.1f;
 *   p1[i+2] = v2 * 1.1f;
 *   p1[i+3] = v3 * 1.1f;
 *
 *
 *
 *
 *
 * 結果:
 * E350 (1.6GHz) :
 * | $ ./a.out 1024
 * | nosched 3108.794678[clk/loop], 3.035932[cyc/data]
 * | sched 3100.598877[clk/loop], 3.027929[cyc/data]
 * | read only 2624.018555[clk/loop], 2.562518[cyc/data]
 * | read only sched 2855.861572[clk/loop], 2.788927[cyc/data]
 * | nosched-hazard 14360.824951[clk/loop], 14.024243[cyc/data]
 * | sched-hazard 3122.873291[clk/loop], 3.049681[cyc/data]
 * |
 * | $ ./a.out 16384
 * | nosched 49393.120605[clk/loop], 3.014717[cyc/data]
 * | sched 50288.229492[clk/loop], 3.069350[cyc/data]
 * | read only 41004.123779[clk/loop], 2.502693[cyc/data]
 * | read only sched 45133.377686[clk/loop], 2.754723[cyc/data]
 * | nosched-hazard 231441.703857[clk/loop], 14.126081[cyc/data]
 * | sched-hazard 50742.266602[clk/loop], 3.097062[cyc/data]
 *
 * Atom N550(1.5GHz) :
 * | $ ./a.out 1024
 * | nosched 6486.279785[clk/loop], 6.334258[cyc/data]
 * | sched 3380.222900[clk/loop], 3.300999[cyc/data]
 * | read only 5689.654541[clk/loop], 5.556303[cyc/data]
 * | read only sched 3140.129883[clk/loop], 3.066533[cyc/data]
 * | nosched-hazard 13649.058105[clk/loop], 13.329158[cyc/data]
 * | sched-hazard 3381.005127[clk/loop], 3.301763[cyc/data]
 * |
 * | $ ./a.out 16384
 * | nosched 112515.517090[clk/loop], 6.867402[cyc/data]
 * | sched 64807.787109[clk/loop], 3.955553[cyc/data]
 * | read only 90629.773682[clk/loop], 5.531602[cyc/data]
 * | read only sched 49514.967773[clk/loop], 3.022154[cyc/data]
 * | nosched-hazard 232532.070557[clk/loop], 14.192631[cyc/data]
 * | sched-hazard 75640.708740[clk/loop], 4.616742[cyc/data]
 *
 * Tegra - Cortex A9(1.0GHz) :
 * | $ ./a.out 1024
 * | nosched 8347.167969[clk/loop], 8.151531[cyc/data]
 * | sched 3715.332031[clk/loop], 3.628254[cyc/data]
 * | read only 8360.351562[clk/loop], 8.164406[cyc/data]
 * | read only sched 3895.507812[clk/loop], 3.804207[cyc/data]
 * | nosched-hazard 14437.255859[clk/loop], 14.098883[cyc/data]
 * | sched-hazard 3656.005859[clk/loop], 3.570318[cyc/data]
 * |
 * | $ ./a.out 16384
 * | nosched 132061.523438[clk/loop], 8.060396[cyc/data]
 * | sched 57900.146484[clk/loop], 3.533944[cyc/data]
 * | read only 131998.291016[clk/loop], 8.056536[cyc/data]
 * | read only sched 61907.226562[clk/loop], 3.778517[cyc/data]
 * | nosched-hazard 282537.597656[clk/loop], 17.244726[cyc/data]
 * | sched-hazard 108981.445312[clk/loop], 6.651700[cyc/data]
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

#if (defined __x86_64__) || (defined __i386__)
#include <x86intrin.h>
#define cpu_clock() __rdtsc()
#define FREG(val) "+x"(val)
#else
#include <sys/time.h>
#define FREG(val) "+w"(val)

/*
 * Fallback timestamp for targets that do not take the x86 __rdtsc() path.
 *
 * Returns the gettimeofday() wall-clock time converted to nanoseconds, so
 * the value equals CPU clock cycles only on a core running at 1 GHz.
 * The underlying clock has microsecond granularity.
 */
static inline unsigned long long
cpu_clock(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    /* @ 1GHz: one nanosecond corresponds to one cycle */
    return (unsigned long long)tv.tv_sec * 1000000000ULL
         + (unsigned long long)tv.tv_usec * 1000ULL;
}
#endif

typedef float data_type;

/*
 * Scale one sample by the constant 1.1f and return the product.
 * This is the plain multiply used by the load/mul/store kernels.
 */
static inline data_type
f(data_type a) {
    const data_type scaled = 1.1f * a;
    return scaled;
}

/*
 * Scale one sample by 1.1f and force the product to exist on its own.
 *
 * The empty asm statement takes the product as a read-write FP register
 * operand, so the multiplication result is used before the caller adds it
 * to an accumulator; this keeps the multiply and the following add from
 * being merged into a multiply-accumulate.
 *
 * __asm__/__volatile__ are spelled with underscores so the file also builds
 * in strict ISO modes (-std=c99, -std=c11), where plain `asm` is not a
 * keyword.
 */
static inline data_type
f2(data_type a) {
    data_type r = a*1.1f;
    __asm__ __volatile__ (" " : FREG(r));
    return r;
}

/*
 * Store kernel, unrolled four times, without manual scheduling: each
 * element is loaded, multiplied and stored before the next one is touched.
 *
 *   out - destination array; may overlap `in` (the hazard runs pass in+1)
 *   in  - source array
 *   n   - number of elements to process
 *
 * The unrolled loop covers the largest multiple of four that fits in n.
 * Any remaining one to three elements are handled one at a time, so no
 * index beyond n-1 is read or written.
 */
static void
__attribute__((noinline,noclone))
func_nosched(data_type *out, data_type *in, int n)
{
    const int n4 = n - (n % 4);
    int i;

    for (i=0; i<n4; i+=4) {
        /* load, mul, store */
        out[i+0] = f(in[i+0]);
        out[i+1] = f(in[i+1]);
        out[i+2] = f(in[i+2]);
        out[i+3] = f(in[i+3]);
    }

    /* leftover elements when n is not a multiple of four */
    for (; i<n; i++) {
        out[i] = f(in[i]);
    }
}

/*
 * Read-only kernel, unrolled four times, without manual scheduling.
 *
 * Each element is loaded, multiplied by 1.1f (via f2) and added to one of
 * four accumulators.  The empty asm after every addition names the
 * accumulator as a read-write operand and clobbers memory, which stops the
 * compiler from moving the next load ahead of the addition just made.
 *
 *   out - unused; present so all kernels share one signature
 *   in  - source array
 *   n   - number of elements to process
 *
 * Returns the sum of the four accumulators.  Elements left over when n is
 * not a multiple of four are added to the first accumulator, so nothing
 * beyond index n-1 is read.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only(data_type *out, data_type *in, int n)
{
    const int n4 = n - (n % 4);
    int i;
    data_type sum0 = 0;
    data_type sum1 = 0;
    data_type sum2 = 0;
    data_type sum3 = 0;

    (void)out;

    for (i=0; i<n4; i+=4) {
        /* load, mul, add */
        sum0 += f2(in[i+0]);
        __asm__ __volatile__ (" " :FREG(sum0):: "memory");
        sum1 += f2(in[i+1]);
        __asm__ __volatile__ (" " :FREG(sum1):: "memory");
        sum2 += f2(in[i+2]);
        __asm__ __volatile__ (" " :FREG(sum2):: "memory");
        sum3 += f2(in[i+3]);
        __asm__ __volatile__ (" " :FREG(sum3):: "memory");
    }

    /* leftover elements when n is not a multiple of four */
    for (; i<n; i++) {
        sum0 += f2(in[i]);
    }
    return sum0 + sum1 + sum2 + sum3;
}

/*
 * Register-only kernel in which successive groups use different
 * temporaries.
 *
 * Every loop iteration issues four copy / multiply / add groups of ARM VFP
 * single-precision instructions on the constant 1.1f; no memory is
 * accessed.  Each group writes its copy and its product into registers
 * that the previous group did not write.  The loop body runs once per four
 * elements of n.
 *
 *   out, in - unused; present so all kernels share one signature
 *   n       - element count; the body executes for i = 0, 4, 8, ... < n
 *
 * Returns the sum of the four accumulators.
 *
 * The temporaries are early-clobber outputs ("=&w") because the template
 * keeps reading the constant operand %8 after it has written them; without
 * the modifier the compiler may place a temporary in the register that
 * holds %8.
 *
 * On targets other than 32-bit ARM the same arithmetic is done in plain C
 * so the program still builds; that path does not reproduce the
 * instruction sequence, so its timing is not comparable.
 *
 * NOTE(review): the second group uses %7 (sum3) as the multiply
 * destination, overwriting that accumulator, and the last group adds %2
 * into it.  Confirm whether a dedicated temporary was intended.  The
 * sequence is kept as is because changing registers would change what is
 * being measured.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_no_output_dep(data_type *out, data_type *in, int n)
{
    int i;
    data_type sum0 = 0;
    data_type sum1 = 0;
    data_type sum2 = 0;
    data_type sum3 = 0;

    (void)out;
    (void)in;

    for (i=0; i<n; i+=4) {
#if defined(__arm__)
        data_type tmp0, tmp1, tmp2, tmp3;

        /* copy, mul, add */
        __asm__ __volatile__ (""
                      "fcpys %0, %8\n\t"
                      "fmuls %6, %0, %8\n\t"
                      "fadds %1, %1, %6\n\t"

                      "fcpys %2, %8\n\t"
                      "fmuls %7, %2, %8\n\t"
                      "fadds %3, %3, %7\n\t"

                      "fcpys %4, %8\n\t"
                      "fmuls %0, %4, %8\n\t"
                      "fadds %5, %5, %0\n\t"

                      "fcpys %6, %8\n\t"
                      "fmuls %2, %6, %8\n\t"
                      "fadds %7, %7, %2\n\t"

                      :"=&w"(tmp0),
                       "+w"(sum0),
                       "=&w"(tmp1),
                       "+w"(sum1),
                       "=&w"(tmp2),
                       "+w"(sum2),
                       "=&w"(tmp3),
                       "+w"(sum3) /* %7 */

                      :"w"(1.1f));
#else
        /* same data flow as the VFP sequence, one statement per instruction */
        const data_type k = 1.1f;
        data_type tmp0, tmp1, tmp2, tmp3;

        tmp0 = k;  tmp3 = tmp0 * k;  sum0 += tmp3;
        tmp1 = k;  sum3 = tmp1 * k;  sum1 += sum3;
        tmp2 = k;  tmp0 = tmp2 * k;  sum2 += tmp0;
        tmp3 = k;  tmp1 = tmp3 * k;  sum3 += tmp1;
#endif
    }
    return sum0 + sum1 + sum2 + sum3;
}

/*
 * Register-only kernel in which every group reuses the same temporary.
 *
 * Every loop iteration issues four copy / multiply / add groups of ARM VFP
 * single-precision instructions on the constant 1.1f; no memory is
 * accessed.  All four groups write their copy and their product into %0,
 * so each group overwrites the register the previous group just used.
 * The loop body runs once per four elements of n.
 *
 *   out, in - unused; present so all kernels share one signature
 *   n       - element count; the body executes for i = 0, 4, 8, ... < n
 *
 * Returns the sum of the four accumulators.
 *
 * tmp1..tmp3 are not referenced by the template; they stay in the operand
 * list so the operand numbering matches the no-output-dependency variant.
 * The temporaries are early-clobber outputs ("=&w") because the template
 * keeps reading the constant operand %8 after it has written %0.
 *
 * On targets other than 32-bit ARM the same arithmetic is done in plain C
 * so the program still builds; that path does not reproduce the
 * instruction sequence, so its timing is not comparable.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_output_dep(data_type *out, data_type *in, int n)
{
    int i;
    data_type sum0 = 0;
    data_type sum1 = 0;
    data_type sum2 = 0;
    data_type sum3 = 0;

    (void)out;
    (void)in;

    for (i=0; i<n; i+=4) {
#if defined(__arm__)
        data_type tmp0, tmp1, tmp2, tmp3;

        /* copy, mul, add */
        __asm__ __volatile__ (""
                      "fcpys %0, %8\n\t"
                      "fmuls %0, %0, %8\n\t"
                      "fadds %1, %1, %0\n\t"

                      "fcpys %0, %8\n\t"
                      "fmuls %0, %0, %8\n\t"
                      "fadds %3, %3, %0\n\t"

                      "fcpys %0, %8\n\t"
                      "fmuls %0, %0, %8\n\t"
                      "fadds %5, %5, %0\n\t"

                      "fcpys %0, %8\n\t"
                      "fmuls %0, %0, %8\n\t"
                      "fadds %7, %7, %0\n\t"

                      :"=&w"(tmp0),
                       "+w"(sum0),
                       "=&w"(tmp1),
                       "+w"(sum1),
                       "=&w"(tmp2),
                       "+w"(sum2),
                       "=&w"(tmp3),
                       "+w"(sum3) /* %7 */

                      :"w"(1.1f));
#else
        /* same data flow as the VFP sequence, one statement per instruction */
        const data_type k = 1.1f;
        data_type tmp0;

        tmp0 = k;  tmp0 = tmp0 * k;  sum0 += tmp0;
        tmp0 = k;  tmp0 = tmp0 * k;  sum1 += tmp0;
        tmp0 = k;  tmp0 = tmp0 * k;  sum2 += tmp0;
        tmp0 = k;  tmp0 = tmp0 * k;  sum3 += tmp0;
#endif
    }
    return sum0 + sum1 + sum2 + sum3;
}

/*
 * Read-only kernel, unrolled four times, hand-scheduled: the four products
 * are computed first (via f2) and only then added to the four accumulators.
 *
 *   out - unused; present so all kernels share one signature
 *   in  - source array
 *   n   - number of elements to process
 *
 * Returns the sum of the four accumulators.  Elements left over when n is
 * not a multiple of four are added to the first accumulator, so nothing
 * beyond index n-1 is read.
 */
static data_type
__attribute__((noinline,noclone))
func_read_only_sched(data_type *out, data_type *in, int n)
{
    const int n4 = n - (n % 4);
    int i;
    data_type sum0 = 0;
    data_type sum1 = 0;
    data_type sum2 = 0;
    data_type sum3 = 0;

    (void)out;

    for (i=0; i<n4; i+=4) {
        /* load, mul for all four elements first ... */
        data_type v0 = f2(in[i+0]);
        data_type v1 = f2(in[i+1]);
        data_type v2 = f2(in[i+2]);
        data_type v3 = f2(in[i+3]);

        /* ... then the four adds */
        sum0 += v0;
        sum1 += v1;
        sum2 += v2;
        sum3 += v3;
    }

    /* leftover elements when n is not a multiple of four */
    for (; i<n; i++) {
        sum0 += f2(in[i]);
    }
    return sum0 + sum1 + sum2 + sum3;
}


/*
 * Store kernel, unrolled four times, hand-scheduled: four elements are
 * loaded first, then multiplied and stored.
 *
 *   out - destination array; may overlap `in` (the hazard runs pass in+1,
 *         in which case the early loads change the result compared with
 *         func_nosched)
 *   in  - source array
 *   n   - number of elements to process
 *
 * The unrolled loop covers the largest multiple of four that fits in n.
 * Any remaining one to three elements are handled one at a time, so no
 * index beyond n-1 is read or written.
 */
static void
__attribute__((noinline,noclone))
func_sched(data_type *out, data_type *in, int n)
{
    const int n4 = n - (n % 4);
    int i;

    for (i=0; i<n4; i+=4) {
        /* load all four inputs first (names chosen not to shadow f2()) */
        data_type v0 = in[i+0];
        data_type v1 = in[i+1];
        data_type v2 = in[i+2];
        data_type v3 = in[i+3];

        /* mul, store */
        out[i+0] = f(v0);
        out[i+1] = f(v1);
        out[i+2] = f(v2);
        out[i+3] = f(v3);
    }

    /* leftover elements when n is not a multiple of four */
    for (; i<n; i++) {
        out[i] = f(in[i]);
    }
}

/* Print one result line: label, clocks per kernel call, clocks per element. */
static void
print_result(const char *label, unsigned long long clk, int nloop, int ndata)
{
    printf("%s %f[clk/loop], %f[cyc/data]\n",
           label, clk/(double)nloop, clk/((double)nloop*ndata));
}

/*
 * Benchmark driver.
 *
 * Usage: a.out [ndata]
 *   ndata - number of elements per kernel call (default 1024)
 *
 * Runs every kernel nloop times, timing each batch with cpu_clock(), and
 * prints one line per kernel.  Returns 0 on success, EXIT_FAILURE on a bad
 * argument or allocation failure.
 */
int
main(int argc, char **argv)
{
    /*
     * PAD: the unrolled kernels work in groups of four and the hazard runs
     * write through in+1, so indices up to ndata+3 can be touched; the
     * buffers get PAD extra elements to keep that inside the allocation.
     * MAX_NDATA: upper bound accepted on the command line.
     */
    enum { PAD = 4, MAX_NDATA = 1 << 26 };

    unsigned long long b, e;
    int i;
    size_t k, nalloc;

    int nloop = 1024*4;
    int ndata = 1024;

    data_type *in;
    data_type *out;

    if (argc > 1) {
        char *end;
        long v = strtol(argv[1], &end, 10);

        /* reject non-numeric, zero, negative and oversized counts */
        if (end == argv[1] || *end != '\0' || v < 1 || v > MAX_NDATA) {
            fprintf(stderr, "usage: %s [ndata]  (1 <= ndata <= %d)\n",
                    argv[0], (int)MAX_NDATA);
            return EXIT_FAILURE;
        }
        ndata = (int)v;
    }

    nalloc = (size_t)ndata + PAD;
    in = memalign(128, sizeof *in * nalloc);
    out = memalign(128, sizeof *out * nalloc);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "%s: out of memory\n", argv[0]);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    /* give the kernels defined input instead of whatever the heap held */
    for (k = 0; k < nalloc; k++) {
        in[k] = 1.0f;
        out[k] = 0.0f;
    }

    /* warm-up */
    func_nosched(out, in, ndata);
    func_sched(out, in, ndata);


    /* no dependency, no scheduling */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_nosched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("nosched", e-b, nloop, ndata);

    /* no dependency, hand-scheduled */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_sched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("sched", e-b, nloop, ndata);

    /* read-only, not scheduled */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only", e-b, nloop, ndata);

    /* read-only, scheduled */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_sched(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only sched", e-b, nloop, ndata);

    /* read-only, no output dependency */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_no_output_dep(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only no output dep", e-b, nloop, ndata);

    /* read-only, with output dependency */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_read_only_output_dep(out, in, ndata);
    }
    e = cpu_clock();
    print_result("read only output dep", e-b, nloop, ndata);


    /* with dependency: the output overlaps the input, shifted by one */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_nosched(in+1, in, ndata);
    }
    e = cpu_clock();
    print_result("nosched-hazard", e-b, nloop, ndata);


    /* dependency broken (this changes the results, so it is not a good thing) */
    b = cpu_clock();
    for (i=0; i<nloop; i++) {
        func_sched(in+1, in, ndata);
    }
    e = cpu_clock();
    print_result("sched-hazard", e-b, nloop, ndata);

    free(out);
    free(in);
    return 0;
}
	/*
	*
	* 結論:
	* Cortex-A9 は Out-of-Order だが、手スケジューリングが効く。
	* (というか、Cortex-A9はOut-of-Orderに見えない)
	*
	* どのプロセッサでも直線アクセスだとL2レイテンシは見えない。
	* (ちゃんとHWプリフェッチが効いてる)
	*
	*
	* E350 :
	* メモリもリオーダーするOut-of-Orderなので
	* 手でスケジューリングしても全く意味が無い
	*
	* Atom :
	* In-Order。手でスケジューリングするとそれなりに効く
	*
	* Cortex-A9 :
	* Out-of-Orderらしいが、read only でも手でスケジューリングするのが効いてるので謎
	* 手でスケジューリングすると、Atom以上に効いてる。ように見える
	*
	*
	* 計測方法:
	* 分岐命令の数はとりあえず忘れて、
	* 4回アンロールして、
	* - 手でスケジューリングしたもの
	* - ただのアンロール
	* を比較して、アンロールがどの程度効果あるか調べる
	*
	* nosched :
	* out[i+0] = in[i+0] * 1.1f;
	* out[i+1] = in[i+1] * 1.1f;
	* out[i+2] = in[i+2] * 1.1f;
	* out[i+3] = in[i+3] * 1.1f;
	*
	* sched :
	* v0 = in[i+0];
	* v1 = in[i+1];
	* v2 = in[i+2];
	* v3 = in[i+3];
	* out[i+0] = v0 * 1.1f;
	* out[i+1] = v1 * 1.1f;
	* out[i+2] = v2 * 1.1f;
	* out[i+3] = v3 * 1.1f;
	*
	* read only:
	* sum0 += in[i+0] * 1.1f;
	* sum1 += in[i+1] * 1.1f;
	* sum2 += in[i+2] * 1.1f;
	* sum3 += in[i+3] * 1.1f;
	*
	* read only sched:
	* v0 = in[i+0] * 1.1f;
	* v1 = in[i+1] * 1.1f;
	* v2 = in[i+2] * 1.1f;
	* v3 = in[i+3] * 1.1f;
	* sum0 += v0;
	* sum1 += v1;
	* sum2 += v2;
	* sum3 += v3;
	*
	* nosched-hazard (ふたつのポインタがオーバーラップして一個前の演算に依存する):
	* p0 = in
	* p1 = in+1
	*
	* p1[i+0] = p0[i+0] * 1.1f;
	* p1[i+1] = p0[i+1] * 1.1f;
	* p1[i+2] = p0[i+2] * 1.1f;
	* p1[i+3] = p0[i+3] * 1.1f;
	*
	*
	* sched-hazard (ふたつのポインタがオーバーラップしてるけど気にしないで手でスケジューリング):
	* p0 = in
	* p1 = in+1
	*
	* v0 = p0[i+0];
	* v1 = p0[i+1];
	* v2 = p0[i+2];
	* v3 = p0[i+3];
	*
	* p1[i+0] = v0 * 1.1f;
	* p1[i+1] = v1 * 1.1f;
	* p1[i+2] = v2 * 1.1f;
	* p1[i+3] = v3 * 1.1f;
	*
	*
	*
	*
	*
	* 結果:
	* E350 (1.6GHz) :
	* | $ ./a.out 1024
	* | nosched 3108.794678[clk/loop], 3.035932[cyc/data]
	* | sched 3100.598877[clk/loop], 3.027929[cyc/data]
	* | read only 2624.018555[clk/loop], 2.562518[cyc/data]
	* | read only sched 2855.861572[clk/loop], 2.788927[cyc/data]
	* | nosched-hazard 14360.824951[clk/loop], 14.024243[cyc/data]
	* | sched-hazard 3122.873291[clk/loop], 3.049681[cyc/data]
	* |
	* | $ ./a.out 16384
	* | nosched 49393.120605[clk/loop], 3.014717[cyc/data]
	* | sched 50288.229492[clk/loop], 3.069350[cyc/data]
	* | read only 41004.123779[clk/loop], 2.502693[cyc/data]
	* | read only sched 45133.377686[clk/loop], 2.754723[cyc/data]
	* | nosched-hazard 231441.703857[clk/loop], 14.126081[cyc/data]
	* | sched-hazard 50742.266602[clk/loop], 3.097062[cyc/data]
	*
	* Atom N550(1.5GHz) :
	* | $ ./a.out 1024
	* | nosched 6486.279785[clk/loop], 6.334258[cyc/data]
	* | sched 3380.222900[clk/loop], 3.300999[cyc/data]
	* | read only 5689.654541[clk/loop], 5.556303[cyc/data]
	* | read only sched 3140.129883[clk/loop], 3.066533[cyc/data]
	* | nosched-hazard 13649.058105[clk/loop], 13.329158[cyc/data]
	* | sched-hazard 3381.005127[clk/loop], 3.301763[cyc/data]
	* |
	* | $ ./a.out 16384
	* | nosched 112515.517090[clk/loop], 6.867402[cyc/data]
	* | sched 64807.787109[clk/loop], 3.955553[cyc/data]
	* | read only 90629.773682[clk/loop], 5.531602[cyc/data]
	* | read only sched 49514.967773[clk/loop], 3.022154[cyc/data]
	* | nosched-hazard 232532.070557[clk/loop], 14.192631[cyc/data]
	* | sched-hazard 75640.708740[clk/loop], 4.616742[cyc/data]
	*
	* Tegra - Cortex A9(1.0GHz) :
	* | $ ./a.out 1024
	* | nosched 8347.167969[clk/loop], 8.151531[cyc/data]
	* | sched 3715.332031[clk/loop], 3.628254[cyc/data]
	* | read only 8360.351562[clk/loop], 8.164406[cyc/data]
	* | read only sched 3895.507812[clk/loop], 3.804207[cyc/data]
	* | nosched-hazard 14437.255859[clk/loop], 14.098883[cyc/data]
	* | sched-hazard 3656.005859[clk/loop], 3.570318[cyc/data]
	* |
	* | $ ./a.out 16384
	* | nosched 132061.523438[clk/loop], 8.060396[cyc/data]
	* | sched 57900.146484[clk/loop], 3.533944[cyc/data]
	* | read only 131998.291016[clk/loop], 8.056536[cyc/data]
	* | read only sched 61907.226562[clk/loop], 3.778517[cyc/data]
	* | nosched-hazard 282537.597656[clk/loop], 17.244726[cyc/data]
	* | sched-hazard 108981.445312[clk/loop], 6.651700[cyc/data]
	*
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <malloc.h>

	/*
	 * Platform selection for the timestamp source and for FREG, the asm
	 * operand constraint that places a float in an FP register
	 * ("x" on x86, "w" otherwise).
	 */
	#if (defined __x86_64__) || (defined __i386__)
	#include <x86intrin.h>
	#define cpu_clock() __rdtsc()
	#define FREG(val) "+x"(val)
	#else
	#include <sys/time.h>
	#define FREG(val) "+w"(val)

	/*
	 * Fallback timestamp for targets that do not take the x86 __rdtsc() path.
	 *
	 * Returns the gettimeofday() wall-clock time converted to nanoseconds, so
	 * the value equals CPU clock cycles only on a core running at 1 GHz.
	 */
	static inline unsigned long long
	cpu_clock(void)
	{
	    struct timeval tv;

	    gettimeofday(&tv, NULL);

	    /* @ 1GHz: one nanosecond corresponds to one cycle */
	    return (unsigned long long)tv.tv_sec * 1000000000ULL
	         + (unsigned long long)tv.tv_usec * 1000ULL;
	}
	#endif

	typedef float data_type;

	/*
	 * Scale one sample by the constant 1.1f and return the product.
	 * This is the plain multiply used by the load/mul/store kernels.
	 */
	static inline data_type
	f(data_type a) {
	    const data_type scaled = 1.1f * a;
	    return scaled;
	}

	/*
	 * Scale one sample by 1.1f and force the product to exist on its own.
	 *
	 * The empty asm statement takes the product as a read-write FP register
	 * operand, so the multiplication result is used before the caller adds it
	 * to an accumulator; this keeps the multiply and the following add from
	 * being merged into a multiply-accumulate.
	 *
	 * __asm__/__volatile__ are spelled with underscores so the file also
	 * builds in strict ISO modes (-std=c99, -std=c11), where plain `asm` is
	 * not a keyword.
	 */
	static inline data_type
	f2(data_type a) {
	    data_type r = a*1.1f;
	    __asm__ __volatile__ (" " : FREG(r));
	    return r;
	}

	/*
	 * Store kernel, unrolled four times, without manual scheduling: each
	 * element is loaded, multiplied and stored before the next one is touched.
	 *
	 *   out - destination array; may overlap `in` (the hazard runs pass in+1)
	 *   in  - source array
	 *   n   - number of elements to process
	 *
	 * The unrolled loop covers the largest multiple of four that fits in n.
	 * Any remaining one to three elements are handled one at a time, so no
	 * index beyond n-1 is read or written.
	 */
	static void
	__attribute__((noinline,noclone))
	func_nosched(data_type *out, data_type *in, int n)
	{
	    const int n4 = n - (n % 4);
	    int i;

	    for (i=0; i<n4; i+=4) {
	        /* load, mul, store */
	        out[i+0] = f(in[i+0]);
	        out[i+1] = f(in[i+1]);
	        out[i+2] = f(in[i+2]);
	        out[i+3] = f(in[i+3]);
	    }

	    /* leftover elements when n is not a multiple of four */
	    for (; i<n; i++) {
	        out[i] = f(in[i]);
	    }
	}

	/*
	 * Read-only kernel, unrolled four times, without manual scheduling.
	 *
	 * Each element is loaded, multiplied by 1.1f (via f2) and added to one of
	 * four accumulators.  The empty asm after every addition names the
	 * accumulator as a read-write operand and clobbers memory, which stops the
	 * compiler from moving the next load ahead of the addition just made.
	 *
	 *   out - unused; present so all kernels share one signature
	 *   in  - source array
	 *   n   - number of elements to process
	 *
	 * Returns the sum of the four accumulators.  Elements left over when n is
	 * not a multiple of four are added to the first accumulator, so nothing
	 * beyond index n-1 is read.
	 */
	static data_type
	__attribute__((noinline,noclone))
	func_read_only(data_type *out, data_type *in, int n)
	{
	    const int n4 = n - (n % 4);
	    int i;
	    data_type sum0 = 0;
	    data_type sum1 = 0;
	    data_type sum2 = 0;
	    data_type sum3 = 0;

	    (void)out;

	    for (i=0; i<n4; i+=4) {
	        /* load, mul, add */
	        sum0 += f2(in[i+0]);
	        __asm__ __volatile__ (" " :FREG(sum0):: "memory");
	        sum1 += f2(in[i+1]);
	        __asm__ __volatile__ (" " :FREG(sum1):: "memory");
	        sum2 += f2(in[i+2]);
	        __asm__ __volatile__ (" " :FREG(sum2):: "memory");
	        sum3 += f2(in[i+3]);
	        __asm__ __volatile__ (" " :FREG(sum3):: "memory");
	    }

	    /* leftover elements when n is not a multiple of four */
	    for (; i<n; i++) {
	        sum0 += f2(in[i]);
	    }
	    return sum0 + sum1 + sum2 + sum3;
	}

	/*
	 * Register-only kernel in which successive groups use different
	 * temporaries.
	 *
	 * Every loop iteration issues four copy / multiply / add groups of ARM VFP
	 * single-precision instructions on the constant 1.1f; no memory is
	 * accessed.  Each group writes its copy and its product into registers
	 * that the previous group did not write.  The loop body runs once per
	 * four elements of n.
	 *
	 *   out, in - unused; present so all kernels share one signature
	 *   n       - element count; the body executes for i = 0, 4, 8, ... < n
	 *
	 * Returns the sum of the four accumulators.
	 *
	 * The temporaries are early-clobber outputs ("=&w") because the template
	 * keeps reading the constant operand %8 after it has written them; without
	 * the modifier the compiler may place a temporary in the register that
	 * holds %8.
	 *
	 * On targets other than 32-bit ARM the same arithmetic is done in plain C
	 * so the program still builds; that path does not reproduce the
	 * instruction sequence, so its timing is not comparable.
	 *
	 * NOTE(review): the second group uses %7 (sum3) as the multiply
	 * destination, overwriting that accumulator, and the last group adds %2
	 * into it.  Confirm whether a dedicated temporary was intended.  The
	 * sequence is kept as is because changing registers would change what is
	 * being measured.
	 */
	static data_type
	__attribute__((noinline,noclone))
	func_read_only_no_output_dep(data_type *out, data_type *in, int n)
	{
	    int i;
	    data_type sum0 = 0;
	    data_type sum1 = 0;
	    data_type sum2 = 0;
	    data_type sum3 = 0;

	    (void)out;
	    (void)in;

	    for (i=0; i<n; i+=4) {
	#if defined(__arm__)
	        data_type tmp0, tmp1, tmp2, tmp3;

	        /* copy, mul, add */
	        __asm__ __volatile__ (""
	                      "fcpys %0, %8\n\t"
	                      "fmuls %6, %0, %8\n\t"
	                      "fadds %1, %1, %6\n\t"

	                      "fcpys %2, %8\n\t"
	                      "fmuls %7, %2, %8\n\t"
	                      "fadds %3, %3, %7\n\t"

	                      "fcpys %4, %8\n\t"
	                      "fmuls %0, %4, %8\n\t"
	                      "fadds %5, %5, %0\n\t"

	                      "fcpys %6, %8\n\t"
	                      "fmuls %2, %6, %8\n\t"
	                      "fadds %7, %7, %2\n\t"

	                      :"=&w"(tmp0),
	                       "+w"(sum0),
	                       "=&w"(tmp1),
	                       "+w"(sum1),
	                       "=&w"(tmp2),
	                       "+w"(sum2),
	                       "=&w"(tmp3),
	                       "+w"(sum3) /* %7 */

	                      :"w"(1.1f));
	#else
	        /* same data flow as the VFP sequence, one statement per instruction */
	        const data_type k = 1.1f;
	        data_type tmp0, tmp1, tmp2, tmp3;

	        tmp0 = k;  tmp3 = tmp0 * k;  sum0 += tmp3;
	        tmp1 = k;  sum3 = tmp1 * k;  sum1 += sum3;
	        tmp2 = k;  tmp0 = tmp2 * k;  sum2 += tmp0;
	        tmp3 = k;  tmp1 = tmp3 * k;  sum3 += tmp1;
	#endif
	    }
	    return sum0 + sum1 + sum2 + sum3;
	}

	/*
	 * Register-only kernel in which every group reuses the same temporary.
	 *
	 * Every loop iteration issues four copy / multiply / add groups of ARM VFP
	 * single-precision instructions on the constant 1.1f; no memory is
	 * accessed.  All four groups write their copy and their product into %0,
	 * so each group overwrites the register the previous group just used.
	 * The loop body runs once per four elements of n.
	 *
	 *   out, in - unused; present so all kernels share one signature
	 *   n       - element count; the body executes for i = 0, 4, 8, ... < n
	 *
	 * Returns the sum of the four accumulators.
	 *
	 * tmp1..tmp3 are not referenced by the template; they stay in the operand
	 * list so the operand numbering matches the no-output-dependency variant.
	 * The temporaries are early-clobber outputs ("=&w") because the template
	 * keeps reading the constant operand %8 after it has written %0.
	 *
	 * On targets other than 32-bit ARM the same arithmetic is done in plain C
	 * so the program still builds; that path does not reproduce the
	 * instruction sequence, so its timing is not comparable.
	 */
	static data_type
	__attribute__((noinline,noclone))
	func_read_only_output_dep(data_type *out, data_type *in, int n)
	{
	    int i;
	    data_type sum0 = 0;
	    data_type sum1 = 0;
	    data_type sum2 = 0;
	    data_type sum3 = 0;

	    (void)out;
	    (void)in;

	    for (i=0; i<n; i+=4) {
	#if defined(__arm__)
	        data_type tmp0, tmp1, tmp2, tmp3;

	        /* copy, mul, add */
	        __asm__ __volatile__ (""
	                      "fcpys %0, %8\n\t"
	                      "fmuls %0, %0, %8\n\t"
	                      "fadds %1, %1, %0\n\t"

	                      "fcpys %0, %8\n\t"
	                      "fmuls %0, %0, %8\n\t"
	                      "fadds %3, %3, %0\n\t"

	                      "fcpys %0, %8\n\t"
	                      "fmuls %0, %0, %8\n\t"
	                      "fadds %5, %5, %0\n\t"

	                      "fcpys %0, %8\n\t"
	                      "fmuls %0, %0, %8\n\t"
	                      "fadds %7, %7, %0\n\t"

	                      :"=&w"(tmp0),
	                       "+w"(sum0),
	                       "=&w"(tmp1),
	                       "+w"(sum1),
	                       "=&w"(tmp2),
	                       "+w"(sum2),
	                       "=&w"(tmp3),
	                       "+w"(sum3) /* %7 */

	                      :"w"(1.1f));
	#else
	        /* same data flow as the VFP sequence, one statement per instruction */
	        const data_type k = 1.1f;
	        data_type tmp0;

	        tmp0 = k;  tmp0 = tmp0 * k;  sum0 += tmp0;
	        tmp0 = k;  tmp0 = tmp0 * k;  sum1 += tmp0;
	        tmp0 = k;  tmp0 = tmp0 * k;  sum2 += tmp0;
	        tmp0 = k;  tmp0 = tmp0 * k;  sum3 += tmp0;
	#endif
	    }
	    return sum0 + sum1 + sum2 + sum3;
	}

	/*
	 * Read-only kernel, unrolled four times, hand-scheduled: the four products
	 * are computed first (via f2) and only then added to the four accumulators.
	 *
	 *   out - unused; present so all kernels share one signature
	 *   in  - source array
	 *   n   - number of elements to process
	 *
	 * Returns the sum of the four accumulators.  Elements left over when n is
	 * not a multiple of four are added to the first accumulator, so nothing
	 * beyond index n-1 is read.
	 */
	static data_type
	__attribute__((noinline,noclone))
	func_read_only_sched(data_type *out, data_type *in, int n)
	{
	    const int n4 = n - (n % 4);
	    int i;
	    data_type sum0 = 0;
	    data_type sum1 = 0;
	    data_type sum2 = 0;
	    data_type sum3 = 0;

	    (void)out;

	    for (i=0; i<n4; i+=4) {
	        /* load, mul for all four elements first ... */
	        data_type v0 = f2(in[i+0]);
	        data_type v1 = f2(in[i+1]);
	        data_type v2 = f2(in[i+2]);
	        data_type v3 = f2(in[i+3]);

	        /* ... then the four adds */
	        sum0 += v0;
	        sum1 += v1;
	        sum2 += v2;
	        sum3 += v3;
	    }

	    /* leftover elements when n is not a multiple of four */
	    for (; i<n; i++) {
	        sum0 += f2(in[i]);
	    }
	    return sum0 + sum1 + sum2 + sum3;
	}


	/*
	 * Store kernel, unrolled four times, hand-scheduled: four elements are
	 * loaded first, then multiplied and stored.
	 *
	 *   out - destination array; may overlap `in` (the hazard runs pass in+1,
	 *         in which case the early loads change the result compared with
	 *         func_nosched)
	 *   in  - source array
	 *   n   - number of elements to process
	 *
	 * The unrolled loop covers the largest multiple of four that fits in n.
	 * Any remaining one to three elements are handled one at a time, so no
	 * index beyond n-1 is read or written.
	 */
	static void
	__attribute__((noinline,noclone))
	func_sched(data_type *out, data_type *in, int n)
	{
	    const int n4 = n - (n % 4);
	    int i;

	    for (i=0; i<n4; i+=4) {
	        /* load all four inputs first (names chosen not to shadow f2()) */
	        data_type v0 = in[i+0];
	        data_type v1 = in[i+1];
	        data_type v2 = in[i+2];
	        data_type v3 = in[i+3];

	        /* mul, store */
	        out[i+0] = f(v0);
	        out[i+1] = f(v1);
	        out[i+2] = f(v2);
	        out[i+3] = f(v3);
	    }

	    /* leftover elements when n is not a multiple of four */
	    for (; i<n; i++) {
	        out[i] = f(in[i]);
	    }
	}

	/* Print one result line: label, clocks per kernel call, clocks per element. */
	static void
	print_result(const char *label, unsigned long long clk, int nloop, int ndata)
	{
	    printf("%s %f[clk/loop], %f[cyc/data]\n",
	           label, clk/(double)nloop, clk/((double)nloop*ndata));
	}

	/*
	 * Benchmark driver.
	 *
	 * Usage: a.out [ndata]
	 *   ndata - number of elements per kernel call (default 1024)
	 *
	 * Runs every kernel nloop times, timing each batch with cpu_clock(), and
	 * prints one line per kernel.  Returns 0 on success, EXIT_FAILURE on a bad
	 * argument or allocation failure.
	 */
	int
	main(int argc, char **argv)
	{
	    /*
	     * PAD: the unrolled kernels work in groups of four and the hazard runs
	     * write through in+1, so indices up to ndata+3 can be touched; the
	     * buffers get PAD extra elements to keep that inside the allocation.
	     * MAX_NDATA: upper bound accepted on the command line.
	     */
	    enum { PAD = 4, MAX_NDATA = 1 << 26 };

	    unsigned long long b, e;
	    int i;
	    size_t k, nalloc;

	    int nloop = 1024*4;
	    int ndata = 1024;

	    data_type *in;
	    data_type *out;

	    if (argc > 1) {
	        char *end;
	        long v = strtol(argv[1], &end, 10);

	        /* reject non-numeric, zero, negative and oversized counts */
	        if (end == argv[1] || *end != '\0' || v < 1 || v > MAX_NDATA) {
	            fprintf(stderr, "usage: %s [ndata]  (1 <= ndata <= %d)\n",
	                    argv[0], (int)MAX_NDATA);
	            return EXIT_FAILURE;
	        }
	        ndata = (int)v;
	    }

	    nalloc = (size_t)ndata + PAD;
	    in = memalign(128, sizeof *in * nalloc);
	    out = memalign(128, sizeof *out * nalloc);
	    if (in == NULL || out == NULL) {
	        fprintf(stderr, "%s: out of memory\n", argv[0]);
	        free(in);
	        free(out);
	        return EXIT_FAILURE;
	    }

	    /* give the kernels defined input instead of whatever the heap held */
	    for (k = 0; k < nalloc; k++) {
	        in[k] = 1.0f;
	        out[k] = 0.0f;
	    }

	    /* warm-up */
	    func_nosched(out, in, ndata);
	    func_sched(out, in, ndata);


	    /* no dependency, no scheduling */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_nosched(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("nosched", e-b, nloop, ndata);

	    /* no dependency, hand-scheduled */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_sched(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("sched", e-b, nloop, ndata);

	    /* read-only, not scheduled */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_read_only(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("read only", e-b, nloop, ndata);

	    /* read-only, scheduled */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_read_only_sched(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("read only sched", e-b, nloop, ndata);

	    /* read-only, no output dependency */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_read_only_no_output_dep(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("read only no output dep", e-b, nloop, ndata);

	    /* read-only, with output dependency */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_read_only_output_dep(out, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("read only output dep", e-b, nloop, ndata);


	    /* with dependency: the output overlaps the input, shifted by one */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_nosched(in+1, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("nosched-hazard", e-b, nloop, ndata);


	    /* dependency broken (this changes the results, so it is not a good thing) */
	    b = cpu_clock();
	    for (i=0; i<nloop; i++) {
	        func_sched(in+1, in, ndata);
	    }
	    e = cpu_clock();
	    print_result("sched-hazard", e-b, nloop, ndata);

	    free(out);
	    free(in);
	    return 0;
	}