Spasi/FPReduce.java Secret

## FPReduce.java
// double version:

/*
Windows 10, JVM 9.0.4+11, Ryzen 1800X:
--------------------------------------
Benchmark                       (size)   Mode  Cnt     Score     Error   Units
SIMDBenchmark.reduceBuffered      1024  thrpt    3   518,570 ±  27,117  ops/ms
SIMDBenchmark.reduceBuffered     65536  thrpt    3    35,050 ±   0,297  ops/ms
SIMDBenchmark.reduceBuffered    131072  thrpt    3    17,822 ±   1,511  ops/ms
SIMDBenchmark.reduceSimple        1024  thrpt    3  1230,955 ±  24,972  ops/ms
SIMDBenchmark.reduceSimple       65536  thrpt    3    18,628 ±   0,725  ops/ms
SIMDBenchmark.reduceSimple      131072  thrpt    3     9,340 ±   0,217  ops/ms
SIMDBenchmark.reduceUnrolled      1024  thrpt    3  4699,957 ± 121,889  ops/ms
SIMDBenchmark.reduceUnrolled     65536  thrpt    3    73,777 ±   0,889  ops/ms
SIMDBenchmark.reduceUnrolled    131072  thrpt    3    36,505 ±   3,219  ops/ms
SIMDBenchmark.reduceVectorised    1024  thrpt    3   365,681 ±  26,555  ops/ms
SIMDBenchmark.reduceVectorised   65536  thrpt    3    27,931 ±   1,371  ops/ms
SIMDBenchmark.reduceVectorised  131072  thrpt    3    14,286 ±   0,930  ops/ms

Linux, JVM 10+43, Ryzen 1800X:
------------------------------
Benchmark                       (size)   Mode  Cnt     Score    Error   Units
SIMDBenchmark.reduceUnrolled      1024  thrpt    5  4474.529 ± 29.028  ops/ms
SIMDBenchmark.reduceUnrolled     65536  thrpt    5    70.467 ±  1.030  ops/ms
SIMDBenchmark.reduceUnrolled    131072  thrpt    5    35.435 ±  0.293  ops/ms
SIMDBenchmark.reduceVectorised    1024  thrpt    5   381.656 ± 10.818  ops/ms
SIMDBenchmark.reduceVectorised   65536  thrpt    5    36.867 ±  1.764  ops/ms
SIMDBenchmark.reduceVectorised  131072  thrpt    5    18.645 ±  0.546  ops/ms
*/
/**
 * Sums {@code data} by accumulating it, 1024 elements at a time, into a
 * 1024-slot buffer and then handing that buffer to {@code reduce}.
 *
 * <p>Each full 1024-element chunk is copied into a scratch array and added
 * slot by slot to the buffer. Elements after the last full chunk are added
 * to the leading buffer slots, so an input whose length is not a multiple
 * of 1024 (including one shorter than 1024) is no longer silently truncated.
 * When the length is a multiple of 1024 the tail loop runs zero times and
 * the buffer contents are the same as without it.
 *
 * @return the value of {@code reduce(buffer)}; {@code reduce} is declared
 *         outside this excerpt and presumably sums the buffer (TODO confirm)
 */
@Benchmark
public double reduceVectorised() {
    double[] buffer = new double[1024];
    double[] temp   = new double[1024];
    // Number of complete 1024-element chunks in data.
    int chunks = data.length >>> 10;
    for (int i = 0; i < chunks; ++i) {
        System.arraycopy(data, i * 1024, temp, 0, temp.length);
        for (int j = 0; j < 1024; ++j) {
            buffer[j] += temp[j];
        }
    }
    // Fold in the (fewer than 1024) elements left after the last full chunk.
    for (int k = chunks << 10, j = 0; k < data.length; ++k, ++j) {
        buffer[j] += data[k];
    }
    return reduce(buffer);
}

/**
 * Sums {@code data} using four independent {@code double} accumulators,
 * consuming four consecutive elements per loop iteration.
 *
 * <p>Accumulator {@code aK} receives the elements at indices {@code 4*i + K}.
 * Elements after the last complete group of four are added to {@code a0},
 * so an input whose length is not a multiple of 4 is no longer truncated.
 * When the length is a multiple of 4 the tail loop runs zero times and the
 * result is the same as without it.
 *
 * @return {@code a0 + a1 + a2 + a3}, evaluated left to right
 */
@Benchmark
public double reduceUnrolled() {
    double a0 = 0.0;
    double a1 = 0.0;
    double a2 = 0.0;
    double a3 = 0.0;
    // Number of complete groups of four elements.
    int groups = data.length >> 2;
    for (int i = 0; i < groups; i++) {
        a0 += data[i * 4 + 0];
        a1 += data[i * 4 + 1];
        a2 += data[i * 4 + 2];
        a3 += data[i * 4 + 3];
    }
    // Pick up the 0-3 trailing elements the unrolled loop did not reach.
    for (int i = groups << 2; i < data.length; i++) {
        a0 += data[i];
    }
    return a0 + a1 + a2 + a3;
}

// float version:

/*
Windows 10, JVM 9.0.4+11, Ryzen 1800X:
--------------------------------------
Benchmark                        (size)   Mode  Cnt     Score     Error   Units
SIMDBenchmark.reduceBuffered       1024  thrpt    3   632,734 ± 338,190  ops/ms
SIMDBenchmark.reduceBuffered      65536  thrpt    3    35,742 ±   4,967  ops/ms
SIMDBenchmark.reduceBuffered     131072  thrpt    3    17,894 ±   2,126  ops/ms
SIMDBenchmark.reduceSimple         1024  thrpt    3  1227,272 ±  90,193  ops/ms
SIMDBenchmark.reduceSimple        65536  thrpt    3    18,640 ±   0,616  ops/ms
SIMDBenchmark.reduceSimple       131072  thrpt    3     9,334 ±   0,171  ops/ms
SIMDBenchmark.reduceUnrolled128    1024  thrpt    3  4695,636 ±  98,984  ops/ms
SIMDBenchmark.reduceUnrolled128   65536  thrpt    3    74,801 ±   0,211  ops/ms
SIMDBenchmark.reduceUnrolled128  131072  thrpt    3    36,835 ±   5,834  ops/ms
SIMDBenchmark.reduceUnrolled256    1024  thrpt    3  6750,436 ± 257,662  ops/ms
SIMDBenchmark.reduceUnrolled256   65536  thrpt    3   105,726 ±   0,525  ops/ms
SIMDBenchmark.reduceUnrolled256  131072  thrpt    3    50,739 ±  42,492  ops/ms
SIMDBenchmark.reduceVectorised     1024  thrpt    3   566,206 ±  21,687  ops/ms
SIMDBenchmark.reduceVectorised    65536  thrpt    3    55,265 ±   0,757  ops/ms
SIMDBenchmark.reduceVectorised   131072  thrpt    3    28,468 ±   2,976  ops/ms

Linux, JVM 10+43, Ryzen 1800X:
------------------------------

Benchmark                             (size)   Mode  Cnt     Score    Error   Units
SIMDBenchmarkFloat.reduceUnrolled128    1024  thrpt    5  4614.667 ± 70.817  ops/ms
SIMDBenchmarkFloat.reduceUnrolled128   65536  thrpt    5    71.675 ±  1.006  ops/ms
SIMDBenchmarkFloat.reduceUnrolled128  131072  thrpt    5    36.169 ±  0.308  ops/ms
SIMDBenchmarkFloat.reduceUnrolled256    1024  thrpt    5  5010.649 ± 28.092  ops/ms
SIMDBenchmarkFloat.reduceUnrolled256   65536  thrpt    5    78.317 ±  0.749  ops/ms
SIMDBenchmarkFloat.reduceUnrolled256  131072  thrpt    5    39.264 ±  0.432  ops/ms
SIMDBenchmarkFloat.reduceVectorised     1024  thrpt    5   572.878 ± 32.272  ops/ms
SIMDBenchmarkFloat.reduceVectorised    65536  thrpt    5    70.393 ±  1.993  ops/ms
SIMDBenchmarkFloat.reduceVectorised   131072  thrpt    5    37.340 ±  1.031  ops/ms
 */
/**
 * Sums {@code data} using four independent {@code float} accumulators,
 * consuming four consecutive elements per loop iteration (4 x 32-bit floats
 * is 128 bits, which is presumably what the method name refers to).
 *
 * <p>Accumulator {@code aK} receives the elements at indices {@code 4*i + K}.
 * Elements after the last complete group of four are added to {@code a0},
 * so an input whose length is not a multiple of 4 is no longer truncated.
 * When the length is a multiple of 4 the tail loop runs zero times and the
 * result is the same as without it.
 *
 * @return {@code a0 + a1 + a2 + a3}, evaluated left to right
 */
@Benchmark
public float reduceUnrolled128() {
    float a0 = 0.0f;
    float a1 = 0.0f;
    float a2 = 0.0f;
    float a3 = 0.0f;
    // Number of complete groups of four elements.
    int groups = data.length >> 2;
    for (int i = 0; i < groups; i++) {
        a0 += data[i * 4 + 0];
        a1 += data[i * 4 + 1];
        a2 += data[i * 4 + 2];
        a3 += data[i * 4 + 3];
    }
    // Pick up the 0-3 trailing elements the unrolled loop did not reach.
    for (int i = groups << 2; i < data.length; i++) {
        a0 += data[i];
    }
    return a0 + a1 + a2 + a3;
}

/**
 * Sums {@code data} using eight independent {@code float} accumulators,
 * consuming eight consecutive elements per loop iteration (8 x 32-bit floats
 * is 256 bits, which is presumably what the method name refers to).
 *
 * <p>Accumulator {@code aK} receives the elements at indices {@code 8*i + K}.
 * Elements after the last complete group of eight are added to {@code a0},
 * so an input whose length is not a multiple of 8 is no longer truncated.
 * When the length is a multiple of 8 the tail loop runs zero times and the
 * result is the same as without it.
 *
 * @return {@code a0 + a1 + ... + a7}, evaluated left to right
 */
@Benchmark
public float reduceUnrolled256() {
    float a0 = 0.0f;
    float a1 = 0.0f;
    float a2 = 0.0f;
    float a3 = 0.0f;
    float a4 = 0.0f;
    float a5 = 0.0f;
    float a6 = 0.0f;
    float a7 = 0.0f;
    // Number of complete groups of eight elements.
    int groups = data.length >> 3;
    for (int i = 0; i < groups; i++) {
        a0 += data[i * 8 + 0];
        a1 += data[i * 8 + 1];
        a2 += data[i * 8 + 2];
        a3 += data[i * 8 + 3];
        a4 += data[i * 8 + 4];
        a5 += data[i * 8 + 5];
        a6 += data[i * 8 + 6];
        a7 += data[i * 8 + 7];
    }
    // Pick up the 0-7 trailing elements the unrolled loop did not reach.
    for (int i = groups << 3; i < data.length; i++) {
        a0 += data[i];
    }
    return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
}
	// double version:

	/*
	Windows 10, JVM 9.0.4+11, Ryzen 1800X:
	--------------------------------------
	Benchmark (size) Mode Cnt Score Error Units
	SIMDBenchmark.reduceBuffered 1024 thrpt 3 518,570 ± 27,117 ops/ms
	SIMDBenchmark.reduceBuffered 65536 thrpt 3 35,050 ± 0,297 ops/ms
	SIMDBenchmark.reduceBuffered 131072 thrpt 3 17,822 ± 1,511 ops/ms
	SIMDBenchmark.reduceSimple 1024 thrpt 3 1230,955 ± 24,972 ops/ms
	SIMDBenchmark.reduceSimple 65536 thrpt 3 18,628 ± 0,725 ops/ms
	SIMDBenchmark.reduceSimple 131072 thrpt 3 9,340 ± 0,217 ops/ms
	SIMDBenchmark.reduceUnrolled 1024 thrpt 3 4699,957 ± 121,889 ops/ms
	SIMDBenchmark.reduceUnrolled 65536 thrpt 3 73,777 ± 0,889 ops/ms
	SIMDBenchmark.reduceUnrolled 131072 thrpt 3 36,505 ± 3,219 ops/ms
	SIMDBenchmark.reduceVectorised 1024 thrpt 3 365,681 ± 26,555 ops/ms
	SIMDBenchmark.reduceVectorised 65536 thrpt 3 27,931 ± 1,371 ops/ms
	SIMDBenchmark.reduceVectorised 131072 thrpt 3 14,286 ± 0,930 ops/ms

	Linux, JVM 10+43, Ryzen 1800X:
	------------------------------
	Benchmark (size) Mode Cnt Score Error Units
	SIMDBenchmark.reduceUnrolled 1024 thrpt 5 4474.529 ± 29.028 ops/ms
	SIMDBenchmark.reduceUnrolled 65536 thrpt 5 70.467 ± 1.030 ops/ms
	SIMDBenchmark.reduceUnrolled 131072 thrpt 5 35.435 ± 0.293 ops/ms
	SIMDBenchmark.reduceVectorised 1024 thrpt 5 381.656 ± 10.818 ops/ms
	SIMDBenchmark.reduceVectorised 65536 thrpt 5 36.867 ± 1.764 ops/ms
	SIMDBenchmark.reduceVectorised 131072 thrpt 5 18.645 ± 0.546 ops/ms
	*/
	/**
	 * Sums {@code data} by accumulating it, 1024 elements at a time, into a
	 * 1024-slot buffer and then handing that buffer to {@code reduce}.
	 *
	 * <p>Each full 1024-element chunk is copied into a scratch array and added
	 * slot by slot to the buffer. Elements after the last full chunk are added
	 * to the leading buffer slots, so an input whose length is not a multiple
	 * of 1024 (including one shorter than 1024) is no longer silently truncated.
	 * When the length is a multiple of 1024 the tail loop runs zero times and
	 * the buffer contents are the same as without it.
	 *
	 * @return the value of {@code reduce(buffer)}; {@code reduce} is declared
	 *         outside this excerpt and presumably sums the buffer (TODO confirm)
	 */
	@Benchmark
	public double reduceVectorised() {
		double[] buffer = new double[1024];
		double[] temp = new double[1024];
		// Number of complete 1024-element chunks in data.
		int chunks = data.length >>> 10;
		for (int i = 0; i < chunks; ++i) {
			System.arraycopy(data, i * 1024, temp, 0, temp.length);
			for (int j = 0; j < 1024; ++j) {
				buffer[j] += temp[j];
			}
		}
		// Fold in the (fewer than 1024) elements left after the last full chunk.
		for (int k = chunks << 10, j = 0; k < data.length; ++k, ++j) {
			buffer[j] += data[k];
		}
		return reduce(buffer);
	}

	/**
	 * Sums {@code data} using four independent {@code double} accumulators,
	 * consuming four consecutive elements per loop iteration.
	 *
	 * <p>Accumulator {@code aK} receives the elements at indices {@code 4*i + K}.
	 * Elements after the last complete group of four are added to {@code a0},
	 * so an input whose length is not a multiple of 4 is no longer truncated.
	 * When the length is a multiple of 4 the tail loop runs zero times and the
	 * result is the same as without it.
	 *
	 * @return {@code a0 + a1 + a2 + a3}, evaluated left to right
	 */
	@Benchmark
	public double reduceUnrolled() {
		double a0 = 0.0;
		double a1 = 0.0;
		double a2 = 0.0;
		double a3 = 0.0;
		// Number of complete groups of four elements.
		int groups = data.length >> 2;
		for (int i = 0; i < groups; i++) {
			a0 += data[i * 4 + 0];
			a1 += data[i * 4 + 1];
			a2 += data[i * 4 + 2];
			a3 += data[i * 4 + 3];
		}
		// Pick up the 0-3 trailing elements the unrolled loop did not reach.
		for (int i = groups << 2; i < data.length; i++) {
			a0 += data[i];
		}
		return a0 + a1 + a2 + a3;
	}

	// float version:

	/*
	Windows 10, JVM 9.0.4+11, Ryzen 1800X:
	--------------------------------------
	Benchmark (size) Mode Cnt Score Error Units
	SIMDBenchmark.reduceBuffered 1024 thrpt 3 632,734 ± 338,190 ops/ms
	SIMDBenchmark.reduceBuffered 65536 thrpt 3 35,742 ± 4,967 ops/ms
	SIMDBenchmark.reduceBuffered 131072 thrpt 3 17,894 ± 2,126 ops/ms
	SIMDBenchmark.reduceSimple 1024 thrpt 3 1227,272 ± 90,193 ops/ms
	SIMDBenchmark.reduceSimple 65536 thrpt 3 18,640 ± 0,616 ops/ms
	SIMDBenchmark.reduceSimple 131072 thrpt 3 9,334 ± 0,171 ops/ms
	SIMDBenchmark.reduceUnrolled128 1024 thrpt 3 4695,636 ± 98,984 ops/ms
	SIMDBenchmark.reduceUnrolled128 65536 thrpt 3 74,801 ± 0,211 ops/ms
	SIMDBenchmark.reduceUnrolled128 131072 thrpt 3 36,835 ± 5,834 ops/ms
	SIMDBenchmark.reduceUnrolled256 1024 thrpt 3 6750,436 ± 257,662 ops/ms
	SIMDBenchmark.reduceUnrolled256 65536 thrpt 3 105,726 ± 0,525 ops/ms
	SIMDBenchmark.reduceUnrolled256 131072 thrpt 3 50,739 ± 42,492 ops/ms
	SIMDBenchmark.reduceVectorised 1024 thrpt 3 566,206 ± 21,687 ops/ms
	SIMDBenchmark.reduceVectorised 65536 thrpt 3 55,265 ± 0,757 ops/ms
	SIMDBenchmark.reduceVectorised 131072 thrpt 3 28,468 ± 2,976 ops/ms

	Linux, JVM 10+43, Ryzen 1800X:
	------------------------------

	Benchmark (size) Mode Cnt Score Error Units
	SIMDBenchmarkFloat.reduceUnrolled128 1024 thrpt 5 4614.667 ± 70.817 ops/ms
	SIMDBenchmarkFloat.reduceUnrolled128 65536 thrpt 5 71.675 ± 1.006 ops/ms
	SIMDBenchmarkFloat.reduceUnrolled128 131072 thrpt 5 36.169 ± 0.308 ops/ms
	SIMDBenchmarkFloat.reduceUnrolled256 1024 thrpt 5 5010.649 ± 28.092 ops/ms
	SIMDBenchmarkFloat.reduceUnrolled256 65536 thrpt 5 78.317 ± 0.749 ops/ms
	SIMDBenchmarkFloat.reduceUnrolled256 131072 thrpt 5 39.264 ± 0.432 ops/ms
	SIMDBenchmarkFloat.reduceVectorised 1024 thrpt 5 572.878 ± 32.272 ops/ms
	SIMDBenchmarkFloat.reduceVectorised 65536 thrpt 5 70.393 ± 1.993 ops/ms
	SIMDBenchmarkFloat.reduceVectorised 131072 thrpt 5 37.340 ± 1.031 ops/ms
	*/
	/**
	 * Sums {@code data} using four independent {@code float} accumulators,
	 * consuming four consecutive elements per loop iteration (4 x 32-bit floats
	 * is 128 bits, which is presumably what the method name refers to).
	 *
	 * <p>Accumulator {@code aK} receives the elements at indices {@code 4*i + K}.
	 * Elements after the last complete group of four are added to {@code a0},
	 * so an input whose length is not a multiple of 4 is no longer truncated.
	 * When the length is a multiple of 4 the tail loop runs zero times and the
	 * result is the same as without it.
	 *
	 * @return {@code a0 + a1 + a2 + a3}, evaluated left to right
	 */
	@Benchmark
	public float reduceUnrolled128() {
		float a0 = 0.0f;
		float a1 = 0.0f;
		float a2 = 0.0f;
		float a3 = 0.0f;
		// Number of complete groups of four elements.
		int groups = data.length >> 2;
		for (int i = 0; i < groups; i++) {
			a0 += data[i * 4 + 0];
			a1 += data[i * 4 + 1];
			a2 += data[i * 4 + 2];
			a3 += data[i * 4 + 3];
		}
		// Pick up the 0-3 trailing elements the unrolled loop did not reach.
		for (int i = groups << 2; i < data.length; i++) {
			a0 += data[i];
		}
		return a0 + a1 + a2 + a3;
	}

	/**
	 * Sums {@code data} using eight independent {@code float} accumulators,
	 * consuming eight consecutive elements per loop iteration (8 x 32-bit floats
	 * is 256 bits, which is presumably what the method name refers to).
	 *
	 * <p>Accumulator {@code aK} receives the elements at indices {@code 8*i + K}.
	 * Elements after the last complete group of eight are added to {@code a0},
	 * so an input whose length is not a multiple of 8 is no longer truncated.
	 * When the length is a multiple of 8 the tail loop runs zero times and the
	 * result is the same as without it.
	 *
	 * @return {@code a0 + a1 + ... + a7}, evaluated left to right
	 */
	@Benchmark
	public float reduceUnrolled256() {
		float a0 = 0.0f;
		float a1 = 0.0f;
		float a2 = 0.0f;
		float a3 = 0.0f;
		float a4 = 0.0f;
		float a5 = 0.0f;
		float a6 = 0.0f;
		float a7 = 0.0f;
		// Number of complete groups of eight elements.
		int groups = data.length >> 3;
		for (int i = 0; i < groups; i++) {
			a0 += data[i * 8 + 0];
			a1 += data[i * 8 + 1];
			a2 += data[i * 8 + 2];
			a3 += data[i * 8 + 3];
			a4 += data[i * 8 + 4];
			a5 += data[i * 8 + 5];
			a6 += data[i * 8 + 6];
			a7 += data[i * 8 + 7];
		}
		// Pick up the 0-7 trailing elements the unrolled loop did not reach.
		for (int i = groups << 3; i < data.length; i++) {
			a0 += data[i];
		}
		return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
	}