Skip to content

Instantly share code, notes, and snippets.

@lajash
Created January 30, 2015 08:17
Show Gist options
  • Save lajash/991c1bd6a1fc9d3ffa95 to your computer and use it in GitHub Desktop.
Save lajash/991c1bd6a1fc9d3ffa95 to your computer and use it in GitHub Desktop.
Branch FFT : Test Log
This file has been truncated, but you can view the full file.
Start testing: Jan 30 13:43 IST
----------------------------------------------------------
1/30 Testing: boost_version
1/30 Test: boost_version
Command: "/tmp/vexcl/build/tests/boost_version"
Directory: /tmp/vexcl/build/tests
"boost_version" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
Boost version: 105600
<end of output>
Test time = 0.07 sec
----------------------------------------------------------
Test Passed.
"boost_version" end time: Jan 30 13:43 IST
"boost_version" time elapsed: 00:00:00
----------------------------------------------------------
2/30 Testing: types
2/30 Test: types
Command: "/tmp/vexcl/build/tests/types"
Directory: /tmp/vexcl/build/tests
"types" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
Running 2 test cases...
*** No errors detected
<end of output>
Test time = 0.01 sec
----------------------------------------------------------
Test Passed.
"types" end time: Jan 30 13:43 IST
"types" time elapsed: 00:00:00
----------------------------------------------------------
3/30 Testing: deduce
3/30 Test: deduce
Command: "/tmp/vexcl/build/tests/deduce"
Directory: /tmp/vexcl/build/tests
"deduce" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 9 test cases...
terminal(5)
int
terminal(4.2)
double
terminal(N3vex15vector_terminalE)
double
terminal(N3vex15vector_terminalE)
int
terminal(N3vex15vector_terminalE)
double2
terminal(N3vex10elem_indexE)
ulong
terminal(N3vex12mba_terminalE)
float
terminal(N3vex24tagged_terminal_terminalE)
double
terminal(N3vex18temporary_terminalE)
double
terminal(N3vex20vector_view_terminalE)
int
less(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
long
greater(
terminal(5)
, function(
terminal(N3vex8pow_funcE)
, terminal(N3vex15vector_terminalE)
, multiplies(
terminal(2)
, terminal(N3vex15vector_terminalE)
)
)
)
long
logical_not(
terminal(N3vex15vector_terminalE)
)
long
plus(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
double
plus(
terminal(N3vex15vector_terminalE)
, multiplies(
terminal(2)
, terminal(N3vex15vector_terminalE)
)
)
double
negate(
terminal(N3vex15vector_terminalE)
)
int
multiplies(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
double2
multiplies(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
double2
multiplies(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
double2
function(
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f1)
, terminal(N3vex15vector_terminalE)
)
double
function(
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f2)
, terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
int
function(
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f2)
, plus(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
, minus(
terminal(N3vex15vector_terminalE)
, terminal(N3vex15vector_terminalE)
)
)
int
if_else_(
less(
terminal(N3vex15vector_terminalE)
, terminal(0)
)
, terminal(1)
, terminal(N3vex15vector_terminalE)
)
int
dereference(
if_else_(
less(
terminal(N3vex15vector_terminalE)
, terminal(0)
)
, address_of(
terminal(N3vex15vector_terminalE)
)
, address_of(
terminal(N3vex15vector_terminalE)
)
)
)
double
minus(
function(
terminal(N3vex8cos_funcE)
, terminal(N3vex15vector_terminalE)
)
, function(
terminal(N3vex8sin_funcE)
, terminal(N3vex15vector_terminalE)
)
)
double
function(
terminal(N3vex8pow_funcE)
, terminal(N3vex15vector_terminalE)
, multiplies(
terminal(2)
, terminal(N3vex15vector_terminalE)
)
)
double
terminal(N3vex28reduced_vector_view_terminalE)
double
terminal(N3vex13cast_terminalE)
double
terminal(N3vex13cast_terminalE)
int
*** No errors detected
<end of output>
Test time = 0.02 sec
----------------------------------------------------------
Test Passed.
"deduce" end time: Jan 30 13:43 IST
"deduce" time elapsed: 00:00:00
----------------------------------------------------------
4/30 Testing: context
4/30 Test: context
Command: "/tmp/vexcl/build/tests/context"
Directory: /tmp/vexcl/build/tests
"context" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
Running 1 test case...
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
*** No errors detected
<end of output>
Test time = 0.03 sec
----------------------------------------------------------
Test Passed.
"context" end time: Jan 30 13:43 IST
"context" time elapsed: 00:00:00
----------------------------------------------------------
5/30 Testing: vector_create
5/30 Test: vector_create
Command: "/tmp/vexcl/build/tests/vector_create"
Directory: /tmp/vexcl/build/tests
"vector_create" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 14 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global uint * prm_1,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global uint * prm_1,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] -= prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = sin( prm_2[idx] );
}
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"vector_create" end time: Jan 30 13:43 IST
"vector_create" time elapsed: 00:00:00
----------------------------------------------------------
6/30 Testing: vector_copy
6/30 Test: vector_copy
Command: "/tmp/vexcl/build/tests/vector_copy"
Directory: /tmp/vexcl/build/tests
"vector_copy" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 8 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global ulong * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global ulong * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]];
}
}
*** No errors detected
<end of output>
Test time = 0.18 sec
----------------------------------------------------------
Test Passed.
"vector_copy" end time: Jan 30 13:43 IST
"vector_copy" time elapsed: 00:00:00
----------------------------------------------------------
7/30 Testing: vector_arithmetics
7/30 Test: vector_arithmetics
Command: "/tmp/vexcl/build/tests/vector_arithmetics"
Directory: /tmp/vexcl/build/tests
"vector_arithmetics" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 18 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global double * prm_3,
global double * prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( ( prm_2 * sin( prm_3[idx] ) ) + prm_4[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] += prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] -= prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)0, c = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double y = (prm_1[idx]) - c;
double t = mySum + y;
c = (t - mySum) - y;
mySum = t;
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double MIN_double
(
double prm1,
double prm2
)
{
return prm1 < prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)1.79769e+308;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MIN_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double MAX_double
(
double prm1,
double prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)-1.79769e+308;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double MAX_double
(
double prm1,
double prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * g_odata
)
{
double mySum = (double)-1.79769e+308;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_double(mySum, fabs( ( prm_1[idx] - prm_2[idx] ) ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
double prm_3,
global double * prm_4,
double prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
ulong greater
(
double x,
double y
)
{
return x > y;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_ulong(mySum, greater( prm_1[idx], prm_2[idx] ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
double times2
(
double x
)
{
return x * 2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_ulong(mySum, times2( prm_1[idx] ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
double times4
(
double x
)
{
return x * 4;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_ulong(mySum, times4( prm_1[idx] ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = sin( ( prm_2 * (prm_3 + idx) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int4 make_int4
(
int x
)
{
return (int4)(x, x, x, x);
}
kernel void vexcl_vector_kernel
(
ulong n,
global int4 * prm_1,
int4 prm_2,
int prm_3,
ulong prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int4 make_int4
(
int x
)
{
return (int4)(x, x, x, x);
}
kernel void vexcl_vector_kernel
(
ulong n,
global int4 * prm_1,
int4 prm_2,
int prm_3,
ulong prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) );
}
}
<program source>:10:1: error: too few arguments provided to function-like macro invocation
)
^
<program source>:7:6: error: global variables must have a constant address space qualifier
int4 make_int4
^
<program source>:7:15: error: expected ';' after top level declarator
int4 make_int4
^
;
<program source>:29:65: error: too few arguments provided to function-like macro invocation
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) );
^
unknown location:0: fatal error in "vector_values": std::exception: clBuildProgram
/tmp/vexcl/tests/vector_arithmetics.cpp:142: last checkpoint
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int f
(
int x
)
{
return 2 * x;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = f( f( prm_2[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int g
(
int x
)
{
return 3 * x;
}
int f
(
int x
)
{
return 2 * x;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = g( f( prm_2[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
#define THE_ANSWER 42
int answer
(
int x
)
{
return x * THE_ANSWER;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = answer( prm_2 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double sin2
(
double x
)
{
return pow(sin(x), 2.0);
}
double cos2
(
double x
)
{
return pow(cos(x), 2.0);
}
double one
(
double x
)
{
return sin2(x) + cos2(x);
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = one( prm_2[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
double prm_3,
global double * prm_4,
global double * prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( ( prm_2[idx] > prm_3 ) ? sin( prm_4[idx] ) : cos( prm_5[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
global double * prm_3,
global double * prm_4,
int prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
( *( ( ( prm_1[idx] < prm_2 ) ? ( &( prm_3[idx] ) ) : ( &( prm_4[idx] ) ) ) ) ) = prm_5;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
ulong prm_tag_1_1,
double prm_4,
double prm_7
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( pow( sin( ( ( 6.2831853071795862e+00 ) * (prm_tag_1_1 + idx) ) ), prm_4 ) + pow( cos( ( ( 6.2831853071795862e+00 ) * (prm_tag_1_1 + idx) ) ), prm_7 ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = 42;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = 42;
}
}
CVMS_ERROR_COMPILER_FAILURE: CVMS compiler has crashed or hung building an element.
unknown location:0: fatal error in "constants": std::exception: clBuildProgram
/tmp/vexcl/tests/vector_arithmetics.cpp:270: last checkpoint
*** 2 failures detected in test suite "VectorArithmetics"
<end of output>
Test time = 0.06 sec
----------------------------------------------------------
Test Failed.
"vector_arithmetics" end time: Jan 30 13:43 IST
"vector_arithmetics" time elapsed: 00:00:00
----------------------------------------------------------
8/30 Testing: vector_view
8/30 Test: vector_view
Command: "/tmp/vexcl/build/tests/vector_view"
Directory: /tmp/vexcl/build/tests
"vector_view" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 16 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong idx
)
{
return start + idx * stride0;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong idx
)
{
return start + idx * stride0;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global double * prm_2_expr_2,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double prm_2_val;
{
size_t pos = prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, idx);
size_t idx = pos;
prm_2_val = ( prm_2_expr_1[idx] * prm_2_expr_2[idx] );
}
prm_1[idx] = prm_2_val;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0,
ulong prm_2_slice_length1,
long prm_2_slice_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2_expr_1,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0,
ulong prm_2_slice_length1,
long prm_2_slice_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global ulong * prm_1,
ulong prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2 - (prm_3 + idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global ulong * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global ulong * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global double * prm_2_expr_2,
global ulong * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double prm_2_val;
{
size_t pos = prm_2_slice_1[idx];
size_t idx = pos;
prm_2_val = ( prm_2_expr_1[idx] * prm_2_expr_2[idx] );
}
prm_1[idx] = prm_2_val;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
ulong prm_1,
ulong prm_2,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, ( prm_1 - (prm_2 + idx) ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
ulong prm_2_slice_1,
ulong prm_2_slice_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[( prm_2_slice_1 - (prm_2_slice_2 + idx) )];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1_expr_1,
ulong prm_1_slice_1,
ulong prm_1_slice_2,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[( prm_1_slice_1 - (prm_1_slice_2 + idx) )] = prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
ulong prm_2_slice_1,
ulong prm_2_slice_2_1,
int prm_2_slice_2_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
ulong temp_1 = ( (prm_2_slice_2_1 + idx) + prm_2_slice_2_2 );
prm_1[idx] = prm_2_expr_1[( prm_2_slice_1 - temp_1 )];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong idx
)
{
return start + idx * stride0;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global int * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
global int * g_odata
)
{
int mySum = (int)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_int(mySum, prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, idx)]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong idx
)
{
return start + idx * stride0;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, idx)] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
ulong prm_1_slice_length1,
long prm_1_slice_stride1,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
ulong prm_1_slice_length1,
long prm_1_slice_stride1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global ulong * prm_1,
ulong prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( (prm_2 + idx) * prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1_expr_1,
global ulong * prm_1_slice_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_1[idx]] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr0 = prm_2_start;
for(size_t i0 = 0, ptr1 = ptr0; i0 < prm_2_length0; ++i0, ptr1 += prm_2_stride0)
{
size_t idx = ptr1;
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]);
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr0 = prm_2_start;
for(size_t i0 = 0, ptr1 = ptr0; i0 < prm_2_length0; ++i0, ptr1 += prm_2_stride0)
{
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1)
{
size_t idx = ptr2;
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]);
}
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
ulong prm_1_slice_length1,
long prm_1_slice_stride1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0;
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1)
{
size_t idx = ptr2;
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]);
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0;
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1)
{
size_t idx = ptr2;
int temp_1 = prm_2_1_1[idx];
prm_2_sum = SUM_int(prm_2_sum, ( temp_1 * temp_1 ));
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1,
ulong prm_2_length2,
long prm_2_stride2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0;
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1)
{
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_length2; ++i2, ptr3 += prm_2_stride2)
{
size_t idx = ptr3;
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]);
}
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
double MAX_double
(
double prm1,
double prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1_1,
ulong prm_2_1_start,
ulong prm_2_1_length0,
long prm_2_1_stride0,
ulong prm_2_1_length1,
long prm_2_1_stride1,
ulong prm_2_1_length2,
long prm_2_1_stride2,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double prm_2_sum = (double)-1.79769e+308;
{
size_t pos = idx;
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0;
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1)
{
size_t idx = ptr2;
double prm_2_1_sum = (double)0;
{
size_t pos = idx;
size_t ptr2 = prm_2_1_start + (pos % prm_2_1_length1) * prm_2_1_stride1;
pos /= prm_2_1_length1;
ptr2 += (pos % prm_2_1_length0) * prm_2_1_stride0;
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_1_length2; ++i2, ptr3 += prm_2_1_stride2)
{
size_t idx = ptr3;
prm_2_1_sum = SUM_double(prm_2_1_sum, sin( prm_2_1_1[idx] ));
}
}
prm_2_sum = MAX_double(prm_2_sum, prm_2_1_sum);
}
}
prm_1[idx] = prm_2_sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = (prm_2 + idx);
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
ulong prm_1,
ulong prm_2,
ulong prm_3,
ulong prm_4,
ulong prm_5,
ulong prm_6,
ulong prm_7,
ulong prm_8,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, ( ( prm_1 * ( ( (prm_2 + idx) / prm_3 ) % prm_4 ) ) + ( prm_5 * ( ( (prm_6 + idx) / prm_7 ) % prm_8 ) ) ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_expr_1,
ulong prm_2_slice_1,
ulong prm_2_slice_2,
ulong prm_2_slice_3,
ulong prm_2_slice_4,
ulong prm_2_slice_5,
ulong prm_2_slice_6,
ulong prm_2_slice_7,
ulong prm_2_slice_8
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[( ( prm_2_slice_1 * ( ( (prm_2_slice_2 + idx) / prm_2_slice_3 ) % prm_2_slice_4 ) ) + ( prm_2_slice_5 * ( ( (prm_2_slice_6 + idx) / prm_2_slice_7 ) % prm_2_slice_8 ) ) )];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong idx
)
{
size_t ptr = start + (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_expr_1,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0,
ulong prm_2_slice_length1,
long prm_2_slice_stride1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
ulong prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, (prm_1 + idx));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_expr_1,
ulong prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[(prm_2_slice_1 + idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_expr_1,
ulong prm_2_slice_1,
ulong prm_2_slice_2,
ulong prm_2_slice_3,
ulong prm_2_slice_4,
ulong prm_2_slice_5,
ulong prm_2_slice_6,
ulong prm_2_slice_7,
ulong prm_2_slice_8,
ulong prm_2_slice_9,
ulong prm_2_slice_10,
ulong prm_2_slice_11,
ulong prm_2_slice_12,
ulong prm_2_slice_13,
ulong prm_2_slice_14,
ulong prm_2_slice_15,
ulong prm_2_slice_16
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[( ( prm_2_slice_1 * ( ( (prm_2_slice_2 + idx) / prm_2_slice_3 ) % prm_2_slice_4 ) ) + ( ( prm_2_slice_5 * ( ( (prm_2_slice_6 + idx) / prm_2_slice_7 ) % prm_2_slice_8 ) ) + ( ( prm_2_slice_9 * ( ( (prm_2_slice_10 + idx) / prm_2_slice_11 ) % prm_2_slice_12 ) ) + ( prm_2_slice_13 * ( ( (prm_2_slice_14 + idx) / prm_2_slice_15 ) % prm_2_slice_16 ) ) ) ) )];
}
}
*** No errors detected
<end of output>
Test time = 0.08 sec
----------------------------------------------------------
Test Passed.
"vector_view" end time: Jan 30 13:43 IST
"vector_view" time elapsed: 00:00:00
----------------------------------------------------------
9/30 Testing: vector_pointer
9/30 Test: vector_pointer
Command: "/tmp/vexcl/build/tests/vector_pointer"
Directory: /tmp/vexcl/build/tests
"vector_pointer" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double nbody
(
ulong n,
ulong j,
global double * x
)
{
double sum = 0; for(size_t i = 0; i < n; ++i) if (i != j) sum += x[i]; return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
ulong prm_2,
ulong prm_3,
global double * prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = nbody( prm_2, (prm_3 + idx), prm_4 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
ulong prm_3_1,
global double * prm_5,
global double * prm_7,
ulong prm_tag_1_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
ulong temp_1 = (prm_3_1 + idx);
ulong temp_2 = ( ( temp_1 > ( 0 ) ) ? ( temp_1 - ( 1 ) ) : temp_1 );
ulong temp_3 = ( ( ( temp_1 + ( 1 ) ) < prm_tag_1_1 ) ? ( temp_1 + ( 1 ) ) : temp_1 );
prm_1[idx] = ( ( ( ( *( ( prm_2 + temp_1 ) ) ) * ( 2 ) ) - ( *( ( prm_5 + temp_2 ) ) ) ) - ( *( ( prm_7 + temp_3 ) ) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
ulong prm_3_1,
global double * prm_5,
global double * prm_7,
ulong prm_tag_1_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
ulong temp_1 = (prm_3_1 + idx);
ulong temp_2 = ( ( temp_1 > ( 0 ) ) ? ( temp_1 - ( 1 ) ) : temp_1 );
ulong temp_3 = ( ( ( temp_1 + ( 1 ) ) < prm_tag_1_1 ) ? ( temp_1 + ( 1 ) ) : temp_1 );
prm_1[idx] = ( ( ( ( ( prm_2 )[ temp_1 ] ) * ( 2 ) ) - ( ( prm_5 )[ temp_2 ] ) ) - ( ( prm_7 )[ temp_3 ] ) );
}
}
*** No errors detected
<end of output>
Test time = 0.03 sec
----------------------------------------------------------
Test Passed.
"vector_pointer" end time: Jan 30 13:43 IST
"vector_pointer" time elapsed: 00:00:00
----------------------------------------------------------
10/30 Testing: tagged_terminal
10/30 Test: tagged_terminal
Command: "/tmp/vexcl/build/tests/tagged_terminal"
Directory: /tmp/vexcl/build/tests
"tagged_terminal" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 5 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_tag_1_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_tag_1_1[idx] * prm_tag_1_1[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
int prm_tag_3_1,
global double * prm_tag_1_1,
global double * prm_tag_2_1,
global double * g_odata
)
{
double mySum = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_double(mySum, ( ( ( prm_tag_3_1 * prm_tag_1_1[idx] ) * prm_tag_1_1[idx] ) + ( ( prm_tag_3_1 * prm_tag_2_1[idx] ) * prm_tag_2_1[idx] ) ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_tag_1_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_tag_1_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_tag_1_1,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_tag_1_1[idx] = ( prm_tag_1_1[idx] + (prm_3 + idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_tag_3_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong idx
)
{
return start + idx * stride0;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_tag_1_1,
global double * prm_tag_2_1,
global double * prm_tag_3_1_expr_1,
ulong prm_tag_3_1_slice_start,
ulong prm_tag_3_1_slice_length0,
long prm_tag_3_1_slice_stride0
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( ( prm_tag_1_1 * prm_tag_2_1[idx] ) + prm_tag_3_1_expr_1[prm_tag_3_1_slice_func(prm_tag_3_1_slice_start, prm_tag_3_1_slice_length0, prm_tag_3_1_slice_stride0, idx)] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_tag_1_1_expr_1,
ulong prm_tag_1_1_slice_1,
ulong prm_tag_1_1_slice_2_1,
int prm_tag_1_1_slice_2_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
ulong temp_1 = ( (prm_tag_1_1_slice_2_1 + idx) + prm_tag_1_1_slice_2_2 );
prm_1[idx] = prm_tag_1_1_expr_1[( prm_tag_1_1_slice_1 - temp_1 )];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_tag_1_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_tag_1_1[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_tag_0_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_tag_0_1[idx];
}
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"tagged_terminal" end time: Jan 30 13:43 IST
"tagged_terminal" time elapsed: 00:00:00
----------------------------------------------------------
11/30 Testing: temporary
11/30 Test: temporary
Command: "/tmp/vexcl/build/tests/temporary"
Directory: /tmp/vexcl/build/tests
"temporary" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 6 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double sqr
(
double x
)
{
return x * x;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1,
int prm_2_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = ( sqr( prm_2_1[idx] ) + prm_2_2 );
prm_1[idx] = ( temp_1 * ( prm_3[idx] + temp_1 ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1,
global double * prm_3_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = log( prm_2_1[idx] );
double temp_2 = ( temp_1 + sin( prm_3_2[idx] ) );
prm_1[idx] = ( temp_1 * temp_2 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
int prm_1,
global double * prm_2_1,
int prm_2_2,
global double * prm_3_1,
int prm_3_2,
global double * g_odata
)
{
double mySum = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = pow( sin( prm_2_1[idx] ), prm_2_2 );
double temp_2 = pow( cos( prm_3_1[idx] ), prm_3_2 );
mySum = SUM_double(mySum, ( prm_1 * ( temp_1 + temp_2 ) ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = sin( prm_2_1[idx] );
prm_1[idx] = temp_1;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global double * prm_3_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = sin( prm_3_1[idx] );
prm_1[idx] = sqrt( ( prm_2 - ( temp_1 * temp_1 ) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1000 = tan( prm_2_1[idx] );
prm_1[idx] = ( temp_1000 * temp_1000 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1001 = tan( prm_2_1[idx] );
prm_1[idx] = ( temp_1001 * temp_1001 );
}
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"temporary" end time: Jan 30 13:43 IST
"temporary" time elapsed: 00:00:00
----------------------------------------------------------
12/30 Testing: cast
12/30 Test: cast
Command: "/tmp/vexcl/build/tests/cast"
Directory: /tmp/vexcl/build/tests
"cast" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_1;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int2 * prm_1,
float2 prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = convert_int2( prm_2_1 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int2 * prm_1,
float2 prm_2_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = as_int2( prm_2_1 );
}
}
*** No errors detected
<end of output>
Test time = 0.03 sec
----------------------------------------------------------
Test Passed.
"cast" end time: Jan 30 13:43 IST
"cast" time elapsed: 00:00:00
----------------------------------------------------------
13/30 Testing: multivector_create
13/30 Test: multivector_create
Command: "/tmp/vexcl/build/tests/multivector_create"
Directory: /tmp/vexcl/build/tests
"multivector_create" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 5 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global uint * prm_1,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
*** No errors detected
<end of output>
Test time = 0.06 sec
----------------------------------------------------------
Test Passed.
"multivector_create" end time: Jan 30 13:43 IST
"multivector_create" time elapsed: 00:00:00
----------------------------------------------------------
14/30 Testing: multivector_arithmetics
14/30 Test: multivector_arithmetics
Command: "/tmp/vexcl/build/tests/multivector_arithmetics"
Directory: /tmp/vexcl/build/tests
"multivector_arithmetics" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 11 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double MIN_double
(
double prm1,
double prm2
)
{
return prm1 < prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)1.79769e+308;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MIN_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double MAX_double
(
double prm1,
double prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)-1.79769e+308;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global double * prm_3,
global double * prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( ( prm_2 * prm_3[idx] ) + prm_4[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( sin( prm_2[idx] ) + cos( prm_3[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( cos( prm_2[idx] ) + sin( prm_3[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] - prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
double prm_3,
global double * prm_4,
double prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong greater
(
double x,
double y
)
{
return x > y;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = greater( prm_2[idx], prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2 * (prm_3 + idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = sin( ( prm_2 * (prm_3 + idx) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = cos( ( prm_2 * (prm_3 + idx) ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] += sin( ( prm_2 * prm_3[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] -= sin( ( prm_2 * prm_3[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] *= prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] *= sin( prm_2[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = 42;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = 42;
}
}
CVMS_ERROR_COMPILER_FAILURE: CVMS compiler has crashed or hung building an element.
unknown location:0: fatal error in "integral_constants": std::exception: clBuildProgram
/tmp/vexcl/tests/multivector_arithmetics.cpp:209: last checkpoint
*** 1 failure detected in test suite "MultivectorArithmetics"
<end of output>
Test time = 0.06 sec
----------------------------------------------------------
Test Failed.
"multivector_arithmetics" end time: Jan 30 13:43 IST
"multivector_arithmetics" time elapsed: 00:00:00
----------------------------------------------------------
15/30 Testing: multi_array
15/30 Test: multi_array
Command: "/tmp/vexcl/build/tests/multi_array"
Directory: /tmp/vexcl/build/tests
"multi_array" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 5 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2 * (prm_3 + idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
double prm_3,
global double * prm_4,
double prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong length2,
long stride2,
ulong idx
)
{
size_t ptr = start + (idx % length2) * stride2;
idx /= length2;
ptr += (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
ulong prm_1_slice_length1,
long prm_1_slice_stride1,
ulong prm_1_slice_length2,
long prm_1_slice_stride2,
ulong prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, prm_1_slice_length2, prm_1_slice_stride2, idx)] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong prm_1_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong length2,
long stride2,
ulong idx
)
{
size_t ptr = start + (idx % length2) * stride2;
idx /= length2;
ptr += (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
ulong prm_2_slice_func
(
ulong start,
ulong length0,
long stride0,
ulong length1,
long stride1,
ulong length2,
long stride2,
ulong idx
)
{
size_t ptr = start + (idx % length2) * stride2;
idx /= length2;
ptr += (idx % length1) * stride1;
idx /= length1;
ptr += (idx % length0) * stride0;
return ptr;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1_expr_1,
ulong prm_1_slice_start,
ulong prm_1_slice_length0,
long prm_1_slice_stride0,
ulong prm_1_slice_length1,
long prm_1_slice_stride1,
ulong prm_1_slice_length2,
long prm_1_slice_stride2,
global double * prm_2_expr_1,
ulong prm_2_slice_start,
ulong prm_2_slice_length0,
long prm_2_slice_stride0,
ulong prm_2_slice_length1,
long prm_2_slice_stride1,
ulong prm_2_slice_length2,
long prm_2_slice_stride2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, prm_1_slice_length2, prm_1_slice_stride2, idx)] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, prm_2_slice_length2, prm_2_slice_stride2, idx)];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
int SUM_int
(
int prm1,
int prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_vector_kernel
(
ulong n,
global int * prm_1,
global int * prm_2_1,
ulong prm_2_start,
ulong prm_2_length0,
long prm_2_stride0,
ulong prm_2_length1,
long prm_2_stride1,
ulong prm_2_length2,
long prm_2_stride2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
int prm_2_sum = (int)0;
{
size_t pos = idx;
size_t ptr2 = prm_2_start + (pos % prm_2_length1) * prm_2_stride1;
pos /= prm_2_length1;
ptr2 += (pos % prm_2_length0) * prm_2_stride0;
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_length2; ++i2, ptr3 += prm_2_stride2)
{
size_t idx = ptr3;
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]);
}
}
prm_1[idx] = prm_2_sum;
}
}
*** No errors detected
<end of output>
Test time = 0.03 sec
----------------------------------------------------------
Test Passed.
"multi_array" end time: Jan 30 13:43 IST
"multi_array" time elapsed: 00:00:00
----------------------------------------------------------
16/30 Testing: spmv
16/30 Test: spmv
Command: "/tmp/vexcl/build/tests/spmv"
Directory: /tmp/vexcl/build/tests
"spmv" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605599
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 12 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global ulong * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global ulong * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const ulong * row,
global const ulong * col,
global const double * val,
global const double * in,
global double * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] = scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const ulong * row,
global const ulong * col,
global const double * val,
global const double * in,
global double * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] += scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong MAX_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 > prm2 ? prm1 : prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global int * prm_1,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = MAX_ulong(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_expr_1,
global int * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const uint * row,
global const int * col,
global const double * val,
global const double * in,
global double * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] = scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const uint * row,
global const int * col,
global const double * val,
global const double * in,
global double * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] += scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_spmv
(
global const ulong * idx,
global const ulong * row,
global const int * col,
global const double * val,
global const double * vec,
ulong i
)
{
double sum = 0;
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)
{
sum += val[j] * vec[i + col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global const ulong * prm_2_idx,
global const ulong * prm_2_row,
global const int * prm_2_col,
global const double * prm_2_val,
global const double * prm_2_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx);
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_3_spmv
(
global const ulong * idx,
global const ulong * row,
global const int * col,
global const double * val,
global const double * vec,
ulong i
)
{
double sum = 0;
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)
{
sum += val[j] * vec[i + col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global const ulong * prm_3_idx,
global const ulong * prm_3_row,
global const int * prm_3_col,
global const double * prm_3_val,
global const double * prm_3_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3_spmv(prm_3_idx, prm_3_row, prm_3_col, prm_3_val, prm_3_vec, idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_csr_spmv
(
global const ulong * row,
global const ulong * col,
global const double * val,
global const double * in,
ulong i
)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global const ulong * prm_2_row,
global const ulong * prm_2_col,
global const double * prm_2_val,
global const double * prm_2_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = sin( prm_2_csr_spmv(prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2[idx];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_csr_spmv
(
global const ulong * row,
global const ulong * col,
global const double * val,
global const double * in,
ulong i
)
{
double sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global const ulong * prm_2_row,
global const ulong * prm_2_col,
global const double * prm_2_val,
global const double * prm_2_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = cos( prm_2_csr_spmv(prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_spmv
(
global const ulong * idx,
global const ulong * row,
global const int * col,
global const double * val,
global const double * vec,
ulong i
)
{
double sum = 0;
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)
{
sum += val[j] * vec[i + col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global const ulong * prm_2_idx,
global const ulong * prm_2_row,
global const int * prm_2_col,
global const double * prm_2_val,
global const double * prm_2_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx);
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_3_spmv
(
global const ulong * idx,
global const ulong * row,
global const int * col,
global const double * val,
global const double * vec,
ulong i
)
{
double sum = 0;
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)
{
sum += val[j] * vec[i + col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global const ulong * prm_3_idx,
global const ulong * prm_3_row,
global const int * prm_3_col,
global const double * prm_3_val,
global const double * prm_3_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3_spmv(prm_3_idx, prm_3_row, prm_3_col, prm_3_val, prm_3_vec, idx) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double2 * prm_1,
global double2 * prm_2_expr_1,
global ulong * prm_2_slice_1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const ulong * row,
global const ulong * col,
global const double2 * val,
global const double2 * in,
global double2 * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double2 sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] = scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void csr_spmv
(
ulong n,
double scale,
global const ulong * row,
global const ulong * col,
global const double2 * val,
global const double2 * in,
global double2 * out
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong i = chunk_start; i < chunk_end; ++i)
{
double2 sum = 0;
for(size_t j = row[i], e = row[i + 1]; j < e; ++j)
{
sum += val[j] * in[col[j]];
}
out[i] += scale * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double2 prm_2_spmv
(
global const ulong * idx,
global const ulong * row,
global const int * col,
global const double2 * val,
global const double2 * vec,
ulong i
)
{
double2 sum = 0;
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)
{
sum += val[j] * vec[i + col[j]];
}
return sum;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double2 * prm_1,
global const ulong * prm_2_idx,
global const ulong * prm_2_row,
global const int * prm_2_col,
global const double2 * prm_2_val,
global const double2 * prm_2_vec
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx);
}
}
*** No errors detected
<end of output>
Test time = 0.13 sec
----------------------------------------------------------
Test Passed.
"spmv" end time: Jan 30 13:43 IST
"spmv" time elapsed: 00:00:00
----------------------------------------------------------
17/30 Testing: stencil
17/30 Test: stencil
Command: "/tmp/vexcl/build/tests/stencil"
Directory: /tmp/vexcl/build/tests
"stencil" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605600
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 7 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double read_x
(
long g_id,
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * xloc,
global const double * xrem
)
{
if (g_id >= 0 && g_id < n)
{
return xloc[g_id];
}
else if (g_id < 0)
{
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0;
else return xloc[0];
}
else
{
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0;
else return xloc[n - 1];
}
}
kernel void fast_conv
(
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * s,
global const double * xloc,
global const double * xrem,
global double * y,
double alpha,
double beta,
local double * smem
)
{
local double * S = smem;
local double * X = smem + lhalo + rhalo + 1;
size_t grid_size = get_global_size(0);
int l_id = get_local_id(0);
int block_size = get_local_size(0);
for(int i = l_id; i < rhalo + lhalo + 1; i += block_size) S[i] = s[i];
for(long g_id = get_global_id(0), pos = 0; pos < n; g_id += grid_size, pos += grid_size)
{
for(int i = l_id, j = g_id - lhalo; i < block_size + lhalo + rhalo; i += block_size, j += block_size)
{
X[i] = read_x(j, n, has_left, has_right, lhalo, rhalo, xloc, xrem);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n)
{
double sum = 0;
for(int j = -lhalo; j <= rhalo; j++)
{
sum += S[lhalo + j] * X[lhalo + l_id + j];
}
if (alpha) y[g_id] = alpha * y[g_id] + beta * sum;
else y[g_id] = beta * sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double read_x
(
long g_id,
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * xloc,
global const double * xrem
)
{
if (g_id >= 0 && g_id < n)
{
return xloc[g_id];
}
else if (g_id < 0)
{
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0;
else return xloc[0];
}
else
{
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0;
else return xloc[n - 1];
}
}
kernel void slow_conv
(
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * s,
global const double * xloc,
global const double * xrem,
global double * y,
double alpha,
double beta
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double sum = 0;
for(int j = -lhalo; j <= rhalo; j++)
{
sum += s[lhalo + j] * read_x((long)idx + j, n, has_left, has_right, lhalo, rhalo, xloc, xrem);
}
if (alpha) y[idx] = alpha * y[idx] + beta * sum;
else y[idx] = beta * sum;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
int prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = prm_2;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double read_x
(
long g_id,
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * xloc,
global const double * xrem
)
{
if (g_id >= 0 && g_id < n)
{
return xloc[g_id];
}
else if (g_id < 0)
{
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0;
else return xloc[0];
}
else
{
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0;
else return xloc[n - 1];
}
}
double stencil_oper
(
local const double * X
)
{
return sin(X[1] - X[0]) + sin(X[0] - X[-1]);
}
kernel void convolve
(
ulong n,
char has_left,
char has_right,
int lhalo,
int rhalo,
global const double * xloc,
global const double * xrem,
global double * y,
double alpha,
double beta,
local double * smem
)
{
local double * X = smem;
size_t grid_size = get_global_size(0);
int l_id = get_local_id(0);
int block_size = get_local_size(0);
for(long g_id = get_global_id(0), pos = 0; pos < n; g_id += grid_size, pos += grid_size)
{
for(int i = l_id, j = g_id - lhalo; i < block_size + lhalo + rhalo; i += block_size, j += block_size)
{
X[i] = read_x(j, n, has_left, has_right, lhalo, rhalo, xloc, xrem);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n)
{
double sum = stencil_oper(X + lhalo + l_id);
if (alpha) y[g_id] = alpha * y[g_id] + beta * sum;
else y[g_id] = beta * sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
*** No errors detected
<end of output>
Test time = 0.09 sec
----------------------------------------------------------
Test Passed.
"stencil" end time: Jan 30 13:43 IST
"stencil" time elapsed: 00:00:00
----------------------------------------------------------
18/30 Testing: generator
18/30 Test: generator
Command: "/tmp/vexcl/build/tests/generator"
Directory: /tmp/vexcl/build/tests
"generator" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605600
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 7 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void rk2_stepper
(
ulong n,
global double * p_var1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{ double var1 = p_var1[idx];
double var2 = sin( var1 );
double var3 = ( 1.000000000000e-02 * var2 );
double var4 = ( var1 + ( 5.000000000000e-01 * var3 ) );
double var5 = sin( var4 );
double var6 = ( 1.000000000000e-02 * var5 );
var1 = ( var1 + var6 );
p_var1[idx] = var1;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void rk2_stepper
(
ulong n,
global double * p_var1
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{ double var1 = p_var1[idx];
double var2 = sin( var1 );
double var3 = ( 1.000000000000e-02 * var2 );
double var4 = ( var1 + ( 5.000000000000e-01 * var3 ) );
double var5 = sin( var4 );
double var6 = ( 1.000000000000e-02 * var5 );
var1 = ( var1 + var6 );
p_var1[idx] = var1;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double sin2
(
double x
)
{
double s = sin(x); return s * s;
}
kernel void test_sin2
(
ulong n,
const global double * p_var7,
global double * p_var8
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{ double var7 = p_var7[idx];
double var8 = p_var8[idx];
var8 = sin2( var7 );
p_var8[idx] = var8;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double sin2
(
double x
)
{
double s = sin(x); return s * s;
}
kernel void test_sin2
(
ulong n,
const global double * p_var7,
global double * p_var8
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{ double var7 = p_var7[idx];
double var8 = p_var8[idx];
var8 = sin2( var7 );
p_var8[idx] = var8;
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double rk2
(
double prm1
)
{
double var9 = prm1;
double var10 = sin( var9 );
double var11 = ( 1.000000000000e-02 * var10 );
double var12 = ( var9 + ( 5.000000000000e-01 * var11 ) );
double var13 = sin( var12 );
double var14 = ( 1.000000000000e-02 * var13 );
var9 = ( var9 + var14 );
return var9;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = rk2( prm_2[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double generated_function_1
(
double prm1
)
{
double var15 = prm1;
double var16 = sin( var15 );
double var17 = ( 1.000000000000e-02 * var16 );
double var18 = ( var15 + ( 5.000000000000e-01 * var17 ) );
double var19 = sin( var18 );
double var20 = ( 1.000000000000e-02 * var19 );
var15 = ( var15 + var20 );
return var15;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = generated_function_1( prm_2[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double generated_function_1
(
double prm1,
double prm2
)
{
double var21 = prm1;
double var22 = prm2;
double var23 = ( ( var21 * var21 ) + ( var22 * var22 ) );
return var23;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2,
global double * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = generated_function_1( prm_2[idx], prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_tag_1_1,
double prm_tag_2_1,
double prm_5
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_tag_1_1[idx] + ( prm_tag_2_1 * sin( ( prm_tag_1_1[idx] + ( prm_5 * ( prm_tag_2_1 * sin( prm_tag_1_1[idx] ) ) ) ) ) ) );
}
}
*** No errors detected
<end of output>
Test time = 0.05 sec
----------------------------------------------------------
Test Passed.
"generator" end time: Jan 30 13:43 IST
"generator" time elapsed: 00:00:00
----------------------------------------------------------
19/30 Testing: mba
19/30 Test: mba
Command: "/tmp/vexcl/build/tests/mba"
Directory: /tmp/vexcl/build/tests
"mba" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605600
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 2 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2,
ulong prm_3,
double prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( ( prm_2 * (prm_3 + idx) ) / prm_4 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_B0
(
double t
)
{
return (t * (t * (-t + 3) - 3) + 1) / 6;
}
double prm_2_B1
(
double t
)
{
return (t * t * (3 * t - 6) + 4) / 6;
}
double prm_2_B2
(
double t
)
{
return (t * (t * (-3 * t + 3) + 3) + 1) / 6;
}
double prm_2_B3
(
double t
)
{
return t * t * t / 6;
}
double prm_2_mba
(
double x0,
double x1,
double c0,
double h0,
ulong n0,
ulong m0,
double c1,
double h1,
ulong n1,
ulong m1,
global const double * phi
)
{
double u;
u = (x0 - c0) * h0;
ulong i0 = floor(u) - 1;
double s0 = u - floor(u);
u = (x1 - c1) * h1;
ulong i1 = floor(u) - 1;
double s1 = u - floor(u);
double f = 0;
ulong j, idx;
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B3(s1) * phi[idx];
}
}
return f;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
global double * prm_2_x0_1,
double prm_2_x1_1,
ulong prm_2_x1_2,
double prm_2_x1_3,
double prm_2_c0,
double prm_2_h0,
ulong prm_2_n0,
ulong prm_2_m0,
double prm_2_c1,
double prm_2_h1,
ulong prm_2_n1,
ulong prm_2_m1,
global const double * prm_2_phi
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = sin( prm_2_mba(prm_2_x0_1[idx], ( ( prm_2_x1_1 * (prm_2_x1_2 + idx) ) / prm_2_x1_3 ), prm_2_c0, prm_2_h0, prm_2_n0, prm_2_m0, prm_2_c1, prm_2_h1, prm_2_n1, prm_2_m1, prm_2_phi) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double prm_2_B0
(
double t
)
{
return (t * (t * (-t + 3) - 3) + 1) / 6;
}
double prm_2_B1
(
double t
)
{
return (t * t * (3 * t - 6) + 4) / 6;
}
double prm_2_B2
(
double t
)
{
return (t * (t * (-3 * t + 3) + 3) + 1) / 6;
}
double prm_2_B3
(
double t
)
{
return t * t * t / 6;
}
double prm_2_mba
(
double x0,
double x1,
double c0,
double h0,
ulong n0,
ulong m0,
double c1,
double h1,
ulong n1,
ulong m1,
global const double * phi
)
{
double u;
u = (x0 - c0) * h0;
ulong i0 = floor(u) - 1;
double s0 = u - floor(u);
u = (x1 - c1) * h1;
ulong i1 = floor(u) - 1;
double s1 = u - floor(u);
double f = 0;
ulong j, idx;
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 0;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B0(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 1;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B1(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 2;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B2(s0) * prm_2_B3(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 0;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B0(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 1;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B1(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 2;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B2(s1) * phi[idx];
}
}
idx = 0;
j = i0 + 3;
if (j < n0)
{
idx += j * m0;
j = i1 + 3;
if (j < n1)
{
idx += j * m1;
f += prm_2_B3(s0) * prm_2_B3(s1) * phi[idx];
}
}
return f;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
double prm_2_x0_1_1,
ulong prm_2_x0_1_2,
double prm_2_x0_1_3,
double prm_2_c0,
double prm_2_h0,
ulong prm_2_n0,
ulong prm_2_m0,
double prm_2_c1,
double prm_2_h1,
ulong prm_2_n1,
ulong prm_2_m1,
global const double * prm_2_phi
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
double temp_1 = ( ( prm_2_x0_1_1 * (prm_2_x0_1_2 + idx) ) / prm_2_x0_1_3 );
prm_1[idx] = prm_2_mba(temp_1, temp_1, prm_2_c0, prm_2_h0, prm_2_n0, prm_2_m0, prm_2_c1, prm_2_h1, prm_2_n1, prm_2_m1, prm_2_phi);
}
}
*** No errors detected
<end of output>
Test time = 0.03 sec
----------------------------------------------------------
Test Passed.
"mba" end time: Jan 30 13:43 IST
"mba" time elapsed: 00:00:00
----------------------------------------------------------
20/30 Testing: random
20/30 Test: random
Command: "/tmp/vexcl/build/tests/random"
Directory: /tmp/vexcl/build/tests
"random" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605600
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_uint_2_10
(
uint * ctr,
uint * key
)
{
uint m[2];
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
}
int random_int_philox
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[2];
int res;
} ctr;
uint key[1];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
key[0] = 0x12345678;
philox_uint_2_10(ctr.ctr, key);
return ctr.res;
}
kernel void vexcl_vector_kernel
(
ulong n,
global uint * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_int_philox( (prm_2 + idx), prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_uint_4_10
(
uint * ctr,
uint * key
)
{
uint m[4];
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
}
float4 random_float4_philox
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[4];
uint res_i[4];
float res_f[4];
float4 res;
} ctr;
uint key[2];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2;
key[0] = 0x12345678;
key[1] = 0x12345678;
philox_uint_4_10(ctr.ctr, key);
ctr.res_f[0] = ctr.res_i[0] / 4294967295.0f;
ctr.res_f[1] = ctr.res_i[1] / 4294967295.0f;
ctr.res_f[2] = ctr.res_i[2] / 4294967295.0f;
ctr.res_f[3] = ctr.res_i[3] / 4294967295.0f;
return ctr.res;
}
kernel void vexcl_vector_kernel
(
ulong n,
global float4 * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_float4_philox( (prm_2 + idx), prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_ulong_4_10
(
ulong * ctr,
ulong * key
)
{
ulong m[4];
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B97F4A7C15;
key[1] += 0xBB67AE8584CAA73B;
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]);
m[1] = 0xD2E7470EE14C6C93 * ctr[0];
m[2] = mul_hi(0xCA5A826395121157, ctr[2]);
m[3] = 0xCA5A826395121157 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
}
double4 random_double4_philox
(
ulong prm1,
ulong prm2
)
{
union
{
ulong ctr[4];
ulong res_i[4];
double res_f[4];
double4 res;
} ctr;
ulong key[2];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2;
key[0] = 0x12345678;
key[1] = 0x12345678;
philox_ulong_4_10(ctr.ctr, key);
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0;
ctr.res_f[1] = ctr.res_i[1] / 18446744073709551615.0;
ctr.res_f[2] = ctr.res_i[2] / 18446744073709551615.0;
ctr.res_f[3] = ctr.res_i[3] / 18446744073709551615.0;
return ctr.res;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double4 * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_double4_philox( (prm_2 + idx), prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_uint_2_10
(
uint * ctr,
uint * key
)
{
uint m[2];
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
}
double random_double_philox
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[2];
ulong res_i[1];
double res_f[1];
double res;
} ctr;
uint key[1];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
key[0] = 0x12345678;
philox_uint_2_10(ctr.ctr, key);
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0;
return ctr.res;
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_double_philox( (prm_2 + idx), prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_ulong(mySum, ( prm_1[idx] > prm_2 ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
int prm_2,
global ulong * g_odata
)
{
ulong mySum = (ulong)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_ulong(mySum, ( prm_1[idx] < prm_2 ));
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
double SUM_double
(
double prm1,
double prm2
)
{
return prm1 + prm2;
}
kernel void vexcl_reductor_kernel
(
ulong n,
global double * prm_1,
global double * g_odata
)
{
double mySum = (double)0;
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_double(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_uint_4_10
(
uint * ctr,
uint * key
)
{
uint m[4];
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
key[0] += 0x9E3779B9;
key[1] += 0xBB67AE85;
m[0] = mul_hi(0xD2511F53, ctr[0]);
m[1] = 0xD2511F53 * ctr[0];
m[2] = mul_hi(0xCD9E8D57, ctr[2]);
m[3] = 0xCD9E8D57 * ctr[2];
ctr[0] = m[2] ^ ctr[1] ^ key[0];
ctr[1] = m[3];
ctr[2] = m[0] ^ ctr[3] ^ key[1];
ctr[3] = m[1];
}
double random_normal_double_philox
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[4];
ulong res_i[2];
} ctr;
double u[2];
uint key[2];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2;
key[0] = 0x12345678;
key[1] = 0x12345678;
philox_uint_4_10(ctr.ctr, key);
u[0] = ctr.res_i[0] / 18446744073709551615.0;
u[1] = ctr.res_i[1] / 18446744073709551615.0;
return sqrt(-2 * log(u[0])) * cospi(2 * u[1]);
}
kernel void vexcl_vector_kernel
(
ulong n,
global double * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_normal_double_philox( (prm_2 + idx), prm_3 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary reduction operator: returns the sum of its two arguments. */
double SUM_double
(
    double prm1,
    double prm2
)
{
    const double total = prm1 + prm2;
    return total;
}
/*
 * Partial reduction: sums fabs(prm_1[idx]) over this work-item's chunk.
 *
 * The range [0, n) is split into contiguous chunks of
 * ceil(n / get_global_size(0)) elements, one per work-item. The partial sum is
 * written to g_odata[get_group_id(0)].
 *
 * NOTE(review): every work-item of a group writes the same g_odata slot;
 * presumably the host launches one work-item per group - confirm.
 */
kernel void vexcl_reductor_kernel
(
    ulong n,
    global double * prm_1,
    global double * g_odata
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n) last = n;

    double partial = (double)0;
    for (ulong idx = first; idx < last; ++idx)
    {
        partial = SUM_double(partial, fabs( prm_1[idx] ));
    }
    g_odata[get_group_id(0)] = partial;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Scrambles the two-word counter ctr in place using key.
 *
 * Structure: an initial key addition, then five groups of four
 * add / rotate / xor rounds (20 rounds in total). After each group, two of the
 * three key words {key[0], key[1], p} are added to the counter together with
 * the group number, where p = 0x1BD11BDA ^ key[0] ^ key[1].
 * Odd-numbered groups rotate by 13, 15, 26, 6; even-numbered groups by
 * 17, 29, 16, 24.
 */
void threefry_uint_2_20
(
    uint * ctr,
    uint * key
)
{
    const uint rotations[8] = { 13u, 15u, 26u, 6u, 17u, 29u, 16u, 24u };
    const uint p = 0x1BD11BDA ^ key[0] ^ key[1];

    ctr[0] += key[0];
    ctr[1] += key[1];

    for (uint step = 1; step <= 5; ++step)
    {
        /* Groups 1, 3, 5 use the first four rotations; groups 2, 4 the rest. */
        const uint base = (step & 1u) ? 0u : 4u;
        for (uint r = 0; r < 4; ++r)
        {
            ctr[0] += ctr[1];
            ctr[1] = rotate(ctr[1], rotations[base + r]);
            ctr[1] ^= ctr[0];
        }

        /* Key injection cycles through key[0], key[1], p (slot 2 is p). */
        const uint slot0 = step % 3;
        const uint slot1 = (step + 1) % 3;
        ctr[0] += (slot0 == 2) ? p : key[slot0];
        ctr[1] += (slot1 == 2) ? p : key[slot1];
        ctr[1] += step;
    }
}
/*
 * Produces one double from the Threefry generator.
 *
 * prm1, prm2: counter inputs, each truncated to 32 bits.
 * Returns the scrambled counter, read as one 64-bit unsigned integer, divided
 * by 18446744073709551615.0.
 */
double random_double_threefry
(
    ulong prm1,
    ulong prm2
)
{
    union
    {
        uint  word[2];
        ulong wide[1];
    } state;
    uint round_key[2];

    state.word[0] = prm1; state.word[1] = prm2;
    round_key[0] = 0x12345678;
    round_key[1] = 0x12345678;

    threefry_uint_2_20(state.word, round_key);

    const double scaled = state.wide[0] / 18446744073709551615.0;
    return scaled;
}
/*
 * Fills prm_1[0 .. n) with values from random_double_threefry.
 *
 * Each work-item handles a contiguous chunk of ceil(n / get_global_size(0))
 * indices starting at get_global_id(0) * chunk, clamped to n.
 * Element idx is generated from the pair (prm_2 + idx, prm_3).
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global double * prm_1,
    ulong prm_2,
    int prm_3
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n) last = n;

    for (ulong idx = first; idx < last; ++idx)
    {
        prm_1[idx] = random_double_threefry( (prm_2 + idx), prm_3 );
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary reduction operator: returns the (wrapping) sum of its two arguments. */
ulong SUM_ulong
(
    ulong prm1,
    ulong prm2
)
{
    const ulong total = prm1 + prm2;
    return total;
}
/*
 * Scrambles the two-word counter ctr in place using key.
 *
 * Structure: an initial key addition, then five groups of four
 * add / rotate / xor rounds (20 rounds in total). After each group, two of the
 * three key words {key[0], key[1], p} are added to the counter together with
 * the group number, where p = 0x1BD11BDA ^ key[0] ^ key[1].
 * Odd-numbered groups rotate by 13, 15, 26, 6; even-numbered groups by
 * 17, 29, 16, 24.
 */
void threefry_uint_2_20
(
    uint * ctr,
    uint * key
)
{
    const uint rotations[8] = { 13u, 15u, 26u, 6u, 17u, 29u, 16u, 24u };
    const uint p = 0x1BD11BDA ^ key[0] ^ key[1];

    ctr[0] += key[0];
    ctr[1] += key[1];

    for (uint step = 1; step <= 5; ++step)
    {
        /* Groups 1, 3, 5 use the first four rotations; groups 2, 4 the rest. */
        const uint base = (step & 1u) ? 0u : 4u;
        for (uint r = 0; r < 4; ++r)
        {
            ctr[0] += ctr[1];
            ctr[1] = rotate(ctr[1], rotations[base + r]);
            ctr[1] ^= ctr[0];
        }

        /* Key injection cycles through key[0], key[1], p (slot 2 is p). */
        const uint slot0 = step % 3;
        const uint slot1 = (step + 1) % 3;
        ctr[0] += (slot0 == 2) ? p : key[slot0];
        ctr[1] += (slot1 == 2) ? p : key[slot1];
        ctr[1] += step;
    }
}
/*
 * Produces one double from the Threefry generator.
 *
 * prm1, prm2: counter inputs, each truncated to 32 bits.
 * Returns the scrambled counter, read as one 64-bit unsigned integer, divided
 * by 18446744073709551615.0.
 */
double random_double_threefry
(
    ulong prm1,
    ulong prm2
)
{
    union
    {
        uint  word[2];
        ulong wide[1];
    } state;
    uint round_key[2];

    state.word[0] = prm1; state.word[1] = prm2;
    round_key[0] = 0x12345678;
    round_key[1] = 0x12345678;

    threefry_uint_2_20(state.word, round_key);

    const double scaled = state.wide[0] / 18446744073709551615.0;
    return scaled;
}
/*
 * Partial reduction that counts, over this work-item's chunk of [0, n), how
 * many indices satisfy temp_1^2 + temp_2^2 < prm_5, where temp_1 and temp_2
 * are drawn from random_double_threefry with the same counter
 * (prm_tag_0_1 + idx) and second arguments prm_1_2 and prm_3_2 respectively.
 *
 * Chunks are contiguous and ceil(n / get_global_size(0)) long. The count is
 * written to g_odata[get_group_id(0)].
 *
 * NOTE(review): every work-item of a group writes the same g_odata slot;
 * presumably the host launches one work-item per group - confirm.
 */
kernel void vexcl_reductor_kernel
(
    ulong n,
    ulong prm_tag_0_1,
    int prm_1_2,
    int prm_3_2,
    int prm_5,
    global ulong * g_odata
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n) last = n;

    ulong hits = (ulong)0;
    for (ulong idx = first; idx < last; ++idx)
    {
        double temp_1 = random_double_threefry( (prm_tag_0_1 + idx), prm_1_2 );
        double temp_2 = random_double_threefry( (prm_tag_0_1 + idx), prm_3_2 );
        /* The comparison yields 0 or 1, which is accumulated as the count. */
        hits = SUM_ulong(hits, ( ( ( temp_1 * temp_1 ) + ( temp_2 * temp_2 ) ) < prm_5 ));
    }
    g_odata[get_group_id(0)] = hits;
}
*** No errors detected
<end of output>
Test time = 0.11 sec
----------------------------------------------------------
Test Passed.
"random" end time: Jan 30 13:43 IST
"random" time elapsed: 00:00:00
----------------------------------------------------------
21/30 Testing: sort
21/30 Test: sort
Command: "/tmp/vexcl/build/tests/sort"
Directory: /tmp/vexcl/build/tests
"sort" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605600
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 6 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Element-wise addition: prm_1[idx] = prm_2[idx] + prm_3[idx] for idx in [0, n).
 *
 * Each work-item handles a contiguous chunk of ceil(n / get_global_size(0))
 * indices starting at get_global_id(0) * chunk, clamped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n) last = n;

    for (ulong idx = first; idx < last; ++idx)
    {
        prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Ordering predicate used by the sort: true when x is strictly less than y. */
bool comp
(
    float x,
    float y
)
{
    return y > x;
}
/*
 * Bounds-checked load of up to 11 consecutive floats from global memory.
 * For slot = 0..10, reg[slot] = data[slot + tid] when slot + tid < count;
 * slots whose index is out of range are left untouched.
 */
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) reg[slot] = data[index];
    }
}
/*
 * Loads data[tid .. tid + 10] into reg[0 .. 10].
 * When count >= 11 the loads are unconditional; otherwise the work is
 * delegated to the bounds-checked variant.
 */
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Bounds-checked store of 11 values to global memory:
 * dest[slot + tid] = reg[slot] for every slot whose index is below count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies data[tid .. tid + 10] from local memory into
 * reg[0 .. 10], then executes a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies reg[0 .. 10] into local memory at
 * dest[tid .. tid + 10], then executes a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 floats from global memory (source) to local memory (dest),
 * staging them through a private 11-element array.
 * The load honours count; the store to dest is unconditional, so slots that
 * were not loaded are written with whatever the staging array held.
 */
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
/*
 * Bounds-checked copy from local to global memory:
 * dest[i] = source[i] for i = tid .. tid + 10 wherever i < count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = source[index];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads this work-item's block of 11 consecutive values from local memory:
 * reg[i] = data[11 * tid + i] for i = 0..10.
 * Ends with a work-group barrier using a local-memory fence.
 */
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    const int base = 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[base + slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores this work-item's 11 values into its block of local memory:
 * dest[11 * tid + i] = reg[i] for i = 0..10.
 * Ends with a work-group barrier using a local-memory fence.
 */
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    const int base = 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        dest[base + slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to 11 consecutive ints from global memory.
 * For slot = 0..10, reg[slot] = data[slot + tid] when slot + tid < count;
 * slots whose index is out of range are left untouched.
 */
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) reg[slot] = data[index];
    }
}
/*
 * Loads data[tid .. tid + 10] into reg[0 .. 10].
 * When count >= 11 the loads are unconditional; otherwise the work is
 * delegated to the bounds-checked variant.
 */
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Bounds-checked store of 11 values to global memory:
 * dest[slot + tid] = reg[slot] for every slot whose index is below count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies data[tid .. tid + 10] from local memory into
 * reg[0 .. 10], then executes a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies reg[0 .. 10] into local memory at
 * dest[tid .. tid + 10], then executes a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 ints from global memory (source) to local memory (dest),
 * staging them through a private 11-element array.
 * The load honours count; the store to dest is unconditional, so slots that
 * were not loaded are written with whatever the staging array held.
 */
void global_to_shared_1_11_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
/*
 * Bounds-checked copy from local to global memory:
 * dest[i] = source[i] for i = tid .. tid + 10 wherever i < count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void shared_to_global_1_11_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = source[index];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads this work-item's block of 11 consecutive values from local memory:
 * reg[i] = data[11 * tid + i] for i = 0..10.
 * Ends with a work-group barrier using a local-memory fence.
 */
void shared_to_thread_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    const int base = 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[base + slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores this work-item's 11 values into its block of local memory:
 * dest[11 * tid + i] = reg[i] for i = 0..10.
 * Ends with a work-group barrier using a local-memory fence.
 */
void thread_to_shared_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    const int base = 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        dest[base + slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Sequentially merges 11 elements from two ranges of keys_shared0:
 * A = [a_begin, a_end) and B = [b_begin, b_end).
 *
 * For each of the 11 outputs, the head of A is taken when B is exhausted, or
 * when A is not exhausted and comp(b_key, a_key) is false; otherwise the head
 * of B is taken. The chosen key goes to results0[i] and its position in
 * keys_shared0 to indices[i].
 *
 * After every pick the chosen cursor is advanced and the next key is loaded
 * unconditionally, so keys_shared0 is read one position beyond the last
 * element consumed. Ends with a work-group barrier (local-memory fence).
 */
void serial_merge_11_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const float * keys_shared0,
    float * results0
)
{
    float a_key0 = keys_shared0[a_begin];
    float b_key0 = keys_shared0[b_begin];

    for (int out = 0; out < 11; ++out)
    {
        const bool take_a =
            (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0));

        if (take_a)
        {
            results0[out] = a_key0;
            indices[out] = a_begin;
            ++a_begin;
            a_key0 = keys_shared0[a_begin];
        }
        else
        {
            results0[out] = b_key0;
            indices[out] = b_begin;
            ++b_begin;
            b_key0 = keys_shared0[b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/* Exchanges the two floats pointed to by a0 and b0. */
void swap_float
(
    float * a0,
    float * b0
)
{
    const float held = *b0;
    *b0 = *a0;
    *a0 = held;
}
/*
 * Sorts keys0[0 .. 10] in place with an odd-even transposition network.
 *
 * Runs 11 passes. Even-numbered passes compare the pairs (0,1), (2,3), ...,
 * (8,9); odd-numbered passes compare (1,2), (3,4), ..., (9,10). A pair is
 * swapped when comp(right, left) is true.
 */
void odd_even_transpose_sort_11_float
(
    float * keys0
)
{
    for (int pass = 0; pass < 11; ++pass)
    {
        /* The pass parity selects the starting position of the pairs. */
        for (int left = pass & 1; left < 10; left += 2)
        {
            if (comp(keys0[left + 1], keys0[left]))
            {
                swap_float(keys0 + left, keys0 + left + 1);
            }
        }
    }
}
/*
 * Binary search along the cross-diagonal diag of the merge of a0 and b0.
 *
 * Searches mid in [max(0, diag - b_count), min(diag, a_count)) and returns the
 * first mid for which comp(b0[diag - 1 - mid], a0[mid]) is true, or the upper
 * bound when there is none. The result is the number of elements taken from
 * a0 among the first diag merged outputs.
 */
int merge_path_float
(
    int a_count,
    int b_count,
    int diag,
    local const float * a0,
    local const float * b0
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid = (lo + hi) >> 1;
        if (comp(b0[diag - 1 - mid], a0[mid]))
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * One merge pass of the block sort for a single work-item.
 *
 * coop work-items cooperate on one pair of sorted lists (coop is presumably a
 * power of two, since coop - 1 is used as a bit mask - confirm). The pair
 * starts at element 11 * (tid with the low coop bits cleared); the first list
 * holds 11 * (coop / 2) elements and the pair ends 11 * coop elements after
 * the start, all clamped to count. This work-item's diagonal is
 * 11 * (low coop bits of tid), clamped to count.
 *
 * merge_path_float locates the split on that diagonal and
 * serial_merge_11_float then emits 11 merged keys into keys0 with their source
 * positions in indices.
 */
void block_sort_pass_1_11_float
(
    int tid,
    int count,
    int coop,
    int * indices,
    local const float * keys_shared0,
    float * keys0
)
{
    const int mask = coop - 1;
    const int list = ~mask & tid;
    const int diag = min(count, 11 * (mask & tid));
    const int start = 11 * list;

    const int a_first = min(count, start);
    const int b_first = min(count, start + 11 * (coop / 2));
    const int b_last = min(count, start + 11 * coop);

    const int split = merge_path_float(b_first - a_first, b_last - b_first, diag,
                                       keys_shared0 + a_first, keys_shared0 + b_first);

    serial_merge_11_float(a_first + split, b_first, b_first + diag - split, b_last,
                          indices, keys_shared0, keys0);
}
/*
 * Merge-pass driver of the block sort.
 *
 * This generated variant performs no merge passes: the body does nothing and
 * all three parameters are unused. The two scratch arrays the original
 * declared were never read or written, so they have been dropped.
 * Presumably there are no passes because a single work-item sorts the whole
 * block - confirm against the generator.
 */
void block_sort_loop_1_11_float
(
    int tid,
    int count,
    local float * keys_shared0
)
{
}
/*
 * Sorts one block: each work-item sorts its own 11 keys in place (only when
 * its first element index, 11 * tid, is below count), publishes them to
 * keys_shared0, then runs the merge-pass driver.
 */
void mergesort_1_11_float
(
    int count,
    int tid,
    float * thread_keys0,
    local float * keys_shared0
)
{
    const bool owns_data = 11 * tid < count;
    if (owns_data)
    {
        odd_even_transpose_sort_11_float(thread_keys0);
    }

    thread_to_shared_11_float(thread_keys0, tid, keys_shared0);
    block_sort_loop_1_11_float(tid, count, keys_shared0);
}
/*
 * Sorts each block of 11 consecutive keys of keys_src0 and writes the result
 * to the same positions of keys_dst0.
 *
 * Work-group `block` owns elements [11 * block, 11 * block + count2), where
 * count2 = min(11, count - 11 * block).
 *
 * Steps:
 *   1. Stage the block in local memory and load 11 keys per work-item.
 *   2. If this work-item's 11 slots straddle the end of the valid data, find
 *      the largest valid key (per comp) and write it into the slots at or
 *      beyond count2, so the padding sorts to the end.
 *   3. Sort through mergesort_1_11_float and copy the first count2 keys out.
 */
kernel void block_sort
(
    int count,
    global const float * keys_src0,
    global float * keys_dst0
)
{
    union Shared
    {
        struct
        {
            float keys0[12];
        };
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int gid = 11 * block;
    const int count2 = min(11, count - gid);

    float thread_keys0[11];
    global_to_shared_1_11_float(count2, keys_src0 + gid, tid, shared.keys0);
    shared_to_thread_11_float(shared.keys0, tid, thread_keys0);

    const int first = 11 * tid;
    if (first + 11 > count2 && first < count2)
    {
        /* Largest key among this work-item's valid slots. */
        float max_key0 = thread_keys0[0];
        for (int slot = 1; slot < 11; ++slot)
        {
            if (first + slot < count2 && comp(max_key0, thread_keys0[slot]))
            {
                max_key0 = thread_keys0[slot];
            }
        }

        /* Overwrite the out-of-range slots with that key. */
        for (int slot = 0; slot < 11; ++slot)
        {
            if (first + slot >= count2)
            {
                thread_keys0[slot] = max_key0;
            }
        }
    }

    mergesort_1_11_float(count2, tid, thread_keys0, shared.keys0);
    shared_to_global_1_11_float(count2, shared.keys0, tid, keys_dst0 + gid);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Ordering predicate used by the merge: true when x is strictly less than y. */
bool comp
(
    float x,
    float y
)
{
    return y > x;
}
int4 find_mergesort_frame
(
int coop,
int block,
int nv
)
{
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
int4 frame;
frame.x = nv * start;
frame.y = nv * start + size;
frame.z = size;
return frame;
}
/*
 * Converts the merge-path split points of one block into absolute source
 * intervals.
 *
 * frame:    .x / .y are the starts of the two lists, .z their size.
 * mp0, mp1: merge-path positions for this block and the next one.
 *
 * Returns an int4 where [.x, .y) is the range read from the first list and
 * [.z, .w) the range read from the second, each upper bound clamped to count.
 * For the last block of a cooperating group (low coop bits of block all set)
 * the upper bounds are the ends of the two lists instead of values derived
 * from mp1.
 */
int4 find_mergesort_interval
(
    int4 frame,
    int coop,
    int block,
    int nv,
    int count,
    int mp0,
    int mp1
)
{
    const int diag = nv * block - frame.x;
    const bool last_in_group = (coop - 1) == ((coop - 1) & block);

    int4 interval;
    interval.x = frame.x + mp0;
    interval.z = min(count, frame.y + diag - mp0);

    if (last_in_group)
    {
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    }
    else
    {
        interval.y = min(count, frame.x + mp1);
        interval.w = min(count, frame.y + diag + nv - mp1);
    }
    return interval;
}
/*
 * Computes the source ranges that `block` must merge.
 *
 * Reads this block's and the next block's merge-path positions from
 * mp_global[block] and mp_global[block + 1].
 *
 * When coop is non-zero, the ranges come from find_mergesort_frame and
 * find_mergesort_interval, with a_count as the clamp.
 * When coop is zero, the result is
 *   .x = mp0, .y = mp1, .z = nv * block - mp0,
 *   .w = min(a_count + b_count, nv * block + nv) - mp1.
 */
int4 compute_merge_range
(
    int a_count,
    int b_count,
    int block,
    int coop,
    int nv,
    global const int * mp_global
)
{
    const int mp0 = mp_global[block];
    const int mp1 = mp_global[block + 1];

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, block, nv);
        return find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }

    const int gid = nv * block;
    int4 range;
    range.x = mp0;
    range.y = mp1;
    range.z = gid - range.x;
    range.w = min(a_count + b_count, gid + nv) - range.y;
    return range;
}
/*
 * Bounds-checked load of up to 11 consecutive floats from global memory.
 * For slot = 0..10, reg[slot] = data[slot + tid] when slot + tid < count;
 * slots whose index is out of range are left untouched.
 */
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) reg[slot] = data[index];
    }
}
/*
 * Loads data[tid .. tid + 10] into reg[0 .. 10].
 * When count >= 11 the loads are unconditional; otherwise the work is
 * delegated to the bounds-checked variant.
 */
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Bounds-checked store of 11 values to global memory:
 * dest[slot + tid] = reg[slot] for every slot whose index is below count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies data[tid .. tid + 10] from local memory into
 * reg[0 .. 10], then executes a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Unconditionally copies reg[0 .. 10] into local memory at
 * dest[tid .. tid + 10], then executes a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 floats from global memory (source) to local memory (dest),
 * staging them through a private 11-element array.
 * The load honours count; the store to dest is unconditional, so slots that
 * were not loaded are written with whatever the staging array held.
 */
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
/*
 * Bounds-checked copy from local to global memory:
 * dest[i] = source[i] for i = tid .. tid + 10 wherever i < count.
 * Ends with a work-group barrier using a local-memory fence.
 */
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int index = slot + tid;
        if (index < count) dest[index] = source[index];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the contiguous run data[11 * tid] .. data[11 * tid + 10] from local
 * memory into reg[0] .. reg[10], then waits on a work-group barrier.
 */
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    local const float * src = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the contiguous local-memory run
 * dest[11 * tid] .. dest[11 * tid + 10], then waits on a work-group barrier.
 */
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    local float * out = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load: for i = 0 .. 10, reg[i] receives data[i + tid] only
 * when i + tid < count. Slots that fail the check are left unmodified.
 */
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            reg[i] = data[index];
        }
    }
}
/*
 * Loads data[tid] .. data[tid + 10] into reg[0] .. reg[10].
 * When count is below 11 the work is delegated to the bounds-checked
 * variant; otherwise all 11 elements are read without a per-element check.
 */
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    global const int * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
}
/*
 * Bounds-checked store: for i = 0 .. 10, reg[i] is written to dest[i + tid]
 * when i + tid < count. Ends with a work-group barrier (local fence).
 */
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = reg[i];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies the 11 local-memory ints data[tid] .. data[tid + 10] into
 * reg[0] .. reg[10], then waits on a work-group barrier with a local fence.
 */
void shared_to_regstr_1_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the local-memory slots dest[tid] ..
 * dest[tid + 10], then waits on a work-group barrier with a local fence.
 */
void regstr_to_shared_1_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 ints from global memory to local memory by staging them
 * in a private array.
 *
 * count  - bound forwarded to global_to_regstr_1_11_int.
 * source - global input.
 * tid    - offset forwarded to both helpers.
 * dest   - local output; regstr_to_shared_1_11_int writes all 11 slots.
 */
void global_to_shared_1_11_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    /*
     * Zero-initialised: the store below always writes all 11 slots, so any
     * slot the loader leaves untouched (it skips out-of-range elements when
     * count < 11) would otherwise be copied with an indeterminate value.
     */
    int reg[11] = { 0 };
    global_to_regstr_1_11_int(count, source, tid, reg);
    regstr_to_shared_1_11_int(reg, tid, dest);
}
/*
 * Copies source[index] to dest[index] for index = tid .. tid + 10, skipping
 * every index that is not below count, then waits on a work-group barrier.
 */
void shared_to_global_1_11_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = source[index];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the contiguous run data[11 * tid] .. data[11 * tid + 10] from local
 * memory into reg[0] .. reg[10], then waits on a work-group barrier.
 */
void shared_to_thread_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * src = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the contiguous local-memory run
 * dest[11 * tid] .. dest[11 * tid + 10], then waits on a work-group barrier.
 */
void thread_to_shared_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Produces 11 merged outputs from two runs held in keys_shared0:
 * [a_begin, a_end) and [b_begin, b_end). The runs are expected to be
 * ordered under comp(); the code does not check this.
 *
 * For each output slot i:
 *   results0[i] - the selected key,
 *   indices[i]  - the position in keys_shared0 it was taken from.
 * The A run is chosen when B is exhausted, or when A still has elements and
 * its head is not greater than B's head under comp() (ties go to A).
 *
 * After every pick the next key of the chosen run is fetched unconditionally,
 * so keys_shared0 may be read at a_end or b_end; that slot has to be readable
 * even though its value is never selected.
 * Ends with a work-group barrier (local fence).
 */
void serial_merge_11_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const float * keys_shared0,
    float * results0
)
{
    float a_head = keys_shared0[a_begin];
    float b_head = keys_shared0[b_begin];

    for (int i = 0; i < 11; ++i)
    {
        const bool take_a =
            (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_head, a_head));

        if (take_a)
        {
            results0[i] = a_head;
            indices[i] = a_begin;
            ++a_begin;
            a_head = keys_shared0[a_begin];
        }
        else
        {
            results0[i] = b_head;
            indices[i] = b_begin;
            ++b_begin;
            b_head = keys_shared0[b_begin];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Binary search along cross-diagonal diag of the merge of a0 (a_count keys)
 * and b0 (b_count keys), both in local memory.
 *
 * The search window is [max(0, diag - b_count), min(diag, a_count)]. The
 * result is the first position mid in that window for which
 * comp(b0[diag - 1 - mid], a0[mid]) holds, or the window's upper end when
 * none does. Assuming both inputs are ordered under comp(), this is the
 * number of a0 elements among the first diag merged outputs.
 */
int merge_path_float
(
    int a_count,
    int b_count,
    int diag,
    local const float * a0,
    local const float * b0
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid = (lo + hi) >> 1;
        if (comp(b0[diag - 1 - mid], a0[mid]))
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * Loads 11 elements from the virtual concatenation of two global arrays:
 * positions below a_count come from a_global, the following b_count
 * positions come from b_global.
 *
 * For i = 0 .. 10 the position is i + tid and the value lands in reg[i].
 * When a_count + b_count >= 11 every position is loaded without an upper
 * bound check; otherwise positions at or beyond a_count + b_count are
 * skipped and the matching reg slot is left unmodified.
 */
void load2_to_regstr_1_11_11_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    float * reg
)
{
    /* Rebase B so it can be addressed with the concatenated position. */
    global const float * b_rebased = b_global - a_count;
    const int total = a_count + b_count;
    const bool unchecked = (total >= 11);

    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < a_count)
        {
            reg[i] = a_global[index];
        }
        else if (unchecked || index < total)
        {
            reg[i] = b_rebased[index];
        }
    }
}
/*
 * Loads up to 11 elements of the concatenation of a_global (a_count keys)
 * and b_global (b_count keys) and stores them to local memory, staging them
 * in a private array.
 *
 * tid    - offset forwarded to both helpers.
 * shared - local output; regstr_to_shared_1_11_float writes all 11 slots.
 */
void load2_to_shared_1_11_11_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    local float * shared
)
{
    /*
     * Zero-initialised: the loader skips positions at or beyond
     * a_count + b_count when that total is below 11, yet the store below
     * always copies all 11 slots. Without the initialiser those slots would
     * be read with indeterminate values.
     */
    float reg[11] = { 0.0f };
    load2_to_regstr_1_11_11_float(a_global, a_count, b_global, b_count, tid, reg);
    regstr_to_shared_1_11_float(reg, tid, shared);
}
/*
 * Merges the A sub-range [range.x, range.y) and the B sub-range
 * [range.z, range.w) of the global inputs for one work-item.
 *
 * The incoming a_count / b_count values are not used: both are recomputed
 * from range. The two sub-ranges are staged back to back in keys_shared0,
 * the merge-path split for diagonal 11 * tid is located, and 11 outputs are
 * merged serially into results0 with their source positions in indices.
 */
void merge_keys_indices_1_11_float
(
    int a_count,
    int b_count,
    int4 range,
    int tid,
    int * indices,
    global const float * a_global0,
    global const float * b_global0,
    local float * keys_shared0,
    float * results0
)
{
    a_count = range.y - range.x;
    b_count = range.w - range.z;

    load2_to_shared_1_11_11_float(a_global0 + range.x, a_count,
                                  b_global0 + range.z, b_count,
                                  tid, keys_shared0);

    const int diag = 11 * tid;
    const int split = merge_path_float(a_count, b_count, diag,
                                       keys_shared0, keys_shared0 + a_count);

    /* B is stored right after A, so its positions are offset by a_count. */
    serial_merge_11_float(split, a_count,
                          a_count + diag - split, a_count + b_count,
                          indices, keys_shared0, results0);
}
/*
 * Merges one tile and writes it out: runs merge_keys_indices_1_11_float,
 * places the work-item's 11 results in keys_shared0 at 11 * tid, then copies
 * the tile from local memory to keys_global0 + 11 * block, bounded by the
 * tile's element count (range.y - range.x) + (range.w - range.z).
 *
 * indices_shared is accepted but not used by this function.
 */
void device_merge_1_11_float
(
    int a_count,
    int b_count,
    global const float * a_keys_global0,
    global const float * b_keys_global0,
    global float * keys_global0,
    local float * keys_shared0,
    int tid,
    int block,
    int4 range,
    local int * indices_shared
)
{
    float merged[11];
    int origin[11];

    merge_keys_indices_1_11_float(a_count, b_count, range, tid, origin,
                                  a_keys_global0, b_keys_global0,
                                  keys_shared0, merged);
    thread_to_shared_11_float(merged, tid, keys_shared0);

    const int tile_count = (range.y - range.x) + (range.w - range.z);
    shared_to_global_1_11_float(tile_count, keys_shared0, tid,
                                keys_global0 + 11 * block);
}
/*
 * Merge kernel entry point. Each work-group obtains its input range from
 * compute_merge_range (using the partition table mp_global) and hands it to
 * device_merge_1_11_float, which writes the merged tile to keys_global0.
 */
kernel void merge
(
    int a_count,
    int b_count,
    global const float * a_keys_global0,
    global const float * b_keys_global0,
    global float * keys_global0,
    global const int * mp_global,
    int coop
)
{
    /*
     * Key storage and index storage overlay each other in local memory.
     * keys0 has 12 slots, one more than the 11 elements handled per
     * work-item (presumably slack for the serial merge's read-ahead).
     */
    union Shared
    {
        struct
        {
            float keys0[12];
        };
        int indices[11];
    };
    local union Shared shared;

    const int lid = get_local_id(0);
    const int group = get_group_id(0);
    const int4 range = compute_merge_range(a_count, b_count, group, coop, 11, mp_global);

    device_merge_1_11_float(a_count, b_count,
                            a_keys_global0, b_keys_global0, keys_global0,
                            shared.keys0, lid, group, range, shared.indices);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Ordering predicate for float keys: true exactly when x is strictly less
 * than y.
 */
bool comp
(
    float x,
    float y
)
{
    return y > x;
}
/*
 * Binary search along cross-diagonal diag of the merge of a0 (a_count keys)
 * and b0 (b_count keys), both in global memory.
 *
 * The search window is [max(0, diag - b_count), min(diag, a_count)]. The
 * result is the first position mid in that window for which
 * comp(b0[diag - 1 - mid], a0[mid]) holds, or the window's upper end when
 * none does. Assuming both inputs are ordered under comp(), this is the
 * number of a0 elements among the first diag merged outputs.
 */
int merge_path_float
(
    int a_count,
    int b_count,
    int diag,
    global const float * a0,
    global const float * b0
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid = (lo + hi) >> 1;
        if (comp(b0[diag - 1 - mid], a0[mid]))
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * Computes the frame of a merge-sort pass for a given block.
 *
 * coop  - number of blocks cooperating on one merge; the bit mask below
 *         assumes it is a power of two (TODO confirm with the caller).
 * block - block index.
 * nv    - multiplier converting block counts to element counts.
 *
 * Returns an int4 with
 *   .x = nv * start                    where start = block & ~(coop - 1),
 *   .y = nv * start + nv * (coop >> 1),
 *   .z = nv * (coop >> 1),
 *   .w = 0 (unused; set so the returned vector carries no indeterminate lane).
 */
int4 find_mergesort_frame
(
    int coop,
    int block,
    int nv
)
{
    const int start = ~(coop - 1) & block;
    const int size = nv * (coop >> 1);
    const int first = nv * start;

    return (int4)(first, first + size, size, 0);
}
/*
 * Partition kernel: work-item `partition` (its global id) computes one
 * merge-path split and stores it in mp_global[partition]. Work-items with
 * partition >= num_searches do nothing.
 *
 * With coop == 0 the split is searched over the full inputs at diagonal
 * nv * partition. With coop != 0 the inputs are narrowed to the frame
 * returned by find_mergesort_frame, every bound is clamped to the incoming
 * a_count, and the diagonal is made relative to the frame start.
 */
kernel void merge_partition
(
    int a_count,
    int b_count,
    int nv,
    int coop,
    global int * mp_global,
    int num_searches,
    global const float * a_global0,
    global const float * b_global0
)
{
    const int partition = get_global_id(0);
    if (partition >= num_searches)
    {
        return;
    }

    int a_first = 0;
    int b_first = 0;
    int gid = nv * partition;

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, partition, nv);

        /* Both limits are clamped against the incoming a_count. */
        const int a_limit = min(a_count, frame.x + frame.z);
        const int b_limit = min(a_count, frame.y + frame.z);

        a_first = frame.x;
        b_first = min(a_count, frame.y);
        b_count = b_limit - b_first;
        a_count = a_limit - a_first;
        gid -= a_first;
    }

    const int diag = min(gid, a_count + b_count);
    mp_global[partition] = merge_path_float(a_count, b_count, diag,
                                            a_global0 + a_first,
                                            b_global0 + b_first);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Ordering predicate for int keys: true exactly when x is strictly less
 * than y.
 */
bool comp
(
    int x,
    int y
)
{
    return y > x;
}
/*
 * Bounds-checked load: for i = 0 .. 10, reg[i] receives data[i + tid] only
 * when i + tid < count. Slots that fail the check are left unmodified.
 */
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            reg[i] = data[index];
        }
    }
}
/*
 * Loads data[tid] .. data[tid + 10] into reg[0] .. reg[10].
 * When count is below 11 the work is delegated to the bounds-checked
 * variant; otherwise all 11 elements are read without a per-element check.
 */
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    global const float * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
}
/*
 * Bounds-checked store: for i = 0 .. 10, reg[i] is written to dest[i + tid]
 * when i + tid < count. Ends with a work-group barrier (local fence).
 */
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = reg[i];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies the 11 local-memory floats data[tid] .. data[tid + 10] into
 * reg[0] .. reg[10], then waits on a work-group barrier with a local fence.
 */
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    local const float * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the local-memory slots dest[tid] ..
 * dest[tid + 10], then waits on a work-group barrier with a local fence.
 */
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    local float * out = dest + tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 floats from global memory to local memory by staging them
 * in a private array.
 *
 * count  - bound forwarded to global_to_regstr_1_11_float.
 * source - global input.
 * tid    - offset forwarded to both helpers.
 * dest   - local output; regstr_to_shared_1_11_float writes all 11 slots.
 */
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    /*
     * Zero-initialised: the store below always writes all 11 slots, so any
     * slot the loader leaves untouched (it skips out-of-range elements when
     * count < 11) would otherwise be copied with an indeterminate value.
     */
    float reg[11] = { 0.0f };
    global_to_regstr_1_11_float(count, source, tid, reg);
    regstr_to_shared_1_11_float(reg, tid, dest);
}
/*
 * Copies source[index] to dest[index] for index = tid .. tid + 10, skipping
 * every index that is not below count, then waits on a work-group barrier.
 */
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = source[index];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the contiguous run data[11 * tid] .. data[11 * tid + 10] from local
 * memory into reg[0] .. reg[10], then waits on a work-group barrier.
 */
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    local const float * src = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the contiguous local-memory run
 * dest[11 * tid] .. dest[11 * tid + 10], then waits on a work-group barrier.
 */
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    local float * out = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load: for i = 0 .. 10, reg[i] receives data[i + tid] only
 * when i + tid < count. Slots that fail the check are left unmodified.
 */
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            reg[i] = data[index];
        }
    }
}
/*
 * Loads data[tid] .. data[tid + 10] into reg[0] .. reg[10].
 * When count is below 11 the work is delegated to the bounds-checked
 * variant; otherwise all 11 elements are read without a per-element check.
 */
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    global const int * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
}
/*
 * Bounds-checked store: for i = 0 .. 10, reg[i] is written to dest[i + tid]
 * when i + tid < count. Ends with a work-group barrier (local fence).
 */
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = reg[i];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies the 11 local-memory ints data[tid] .. data[tid + 10] into
 * reg[0] .. reg[10], then waits on a work-group barrier with a local fence.
 */
void shared_to_regstr_1_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * src = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the local-memory slots dest[tid] ..
 * dest[tid + 10], then waits on a work-group barrier with a local fence.
 */
void regstr_to_shared_1_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 ints from global memory to local memory by staging them
 * in a private array.
 *
 * count  - bound forwarded to global_to_regstr_1_11_int.
 * source - global input.
 * tid    - offset forwarded to both helpers.
 * dest   - local output; regstr_to_shared_1_11_int writes all 11 slots.
 */
void global_to_shared_1_11_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    /*
     * Zero-initialised: the store below always writes all 11 slots, so any
     * slot the loader leaves untouched (it skips out-of-range elements when
     * count < 11) would otherwise be copied with an indeterminate value.
     */
    int reg[11] = { 0 };
    global_to_regstr_1_11_int(count, source, tid, reg);
    regstr_to_shared_1_11_int(reg, tid, dest);
}
/*
 * Copies source[index] to dest[index] for index = tid .. tid + 10, skipping
 * every index that is not below count, then waits on a work-group barrier.
 */
void shared_to_global_1_11_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int index = i + tid;
        if (index < count)
        {
            dest[index] = source[index];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the contiguous run data[11 * tid] .. data[11 * tid + 10] from local
 * memory into reg[0] .. reg[10], then waits on a work-group barrier.
 */
void shared_to_thread_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * src = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = src[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores reg[0] .. reg[10] to the contiguous local-memory run
 * dest[11 * tid] .. dest[11 * tid + 10], then waits on a work-group barrier.
 */
void thread_to_shared_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Produces 11 merged outputs from two runs held in keys_shared0:
 * [a_begin, a_end) and [b_begin, b_end). The runs are expected to be
 * ordered under comp(); the code does not check this.
 *
 * For each output slot i:
 *   results0[i] - the selected key,
 *   indices[i]  - the position in keys_shared0 it was taken from.
 * The A run is chosen when B is exhausted, or when A still has elements and
 * its head is not greater than B's head under comp() (ties go to A).
 *
 * After every pick the next key of the chosen run is fetched unconditionally,
 * so keys_shared0 may be read at a_end or b_end; that slot has to be readable
 * even though its value is never selected.
 * Ends with a work-group barrier (local fence).
 */
void serial_merge_11_int
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const int * keys_shared0,
    int * results0
)
{
    int a_head = keys_shared0[a_begin];
    int b_head = keys_shared0[b_begin];

    for (int i = 0; i < 11; ++i)
    {
        const bool take_a =
            (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_head, a_head));

        if (take_a)
        {
            results0[i] = a_head;
            indices[i] = a_begin;
            ++a_begin;
            a_head = keys_shared0[a_begin];
        }
        else
        {
            results0[i] = b_head;
            indices[i] = b_begin;
            ++b_begin;
            b_head = keys_shared0[b_begin];
        }
    }
    /* Every work-item of the group has to reach this barrier. */
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Exchanges the two ints pointed to by a0 and b0.
 * Passing the same address twice leaves the value unchanged.
 */
void swap_int
(
    int * a0,
    int * b0
)
{
    const int from_b = *b0;
    *b0 = *a0;
    *a0 = from_b;
}
/*
 * Exchanges the two floats pointed to by a0 and b0.
 * Passing the same address twice leaves the value unchanged.
 */
void swap_float
(
    float * a0,
    float * b0
)
{
    const float from_b = *b0;
    *b0 = *a0;
    *a0 = from_b;
}
/*
 * Odd-even transposition sort of 11 key/value pairs, in place.
 *
 * Runs 11 phases. Even-numbered phases compare the neighbour pairs
 * (0,1), (2,3), ..., (8,9); odd-numbered phases compare (1,2), (3,4), ...,
 * (9,10). A pair is exchanged when comp(keys0[i + 1], keys0[i]) holds, and
 * the matching entries of vals0 are exchanged together with the keys.
 */
void odd_even_transpose_sort_11_int_float
(
    int * keys0,
    float * vals0
)
{
    for (int phase = 0; phase < 11; ++phase)
    {
        /* The phase's parity selects the first index of the first pair. */
        for (int i = phase & 1; i < 10; i += 2)
        {
            if (comp(keys0[i + 1], keys0[i]))
            {
                swap_int(keys0 + i, keys0 + i + 1);
                swap_float(vals0 + i, vals0 + i + 1);
            }
        }
    }
}
/*
 * Binary search along the diagonal `diag` of two key sequences held in
 * local memory.
 *
 *   a_count, b_count  number of keys available through a0 / b0
 *   diag              diagonal being searched
 *   a0, b0            the two key sequences
 *
 * The search range is [max(0, diag - b_count), min(diag, a_count)]. The
 * result is the position `i` at which the search converges: every probe
 * where comp(b0[diag - 1 - i], a0[i]) is false moves the lower bound up,
 * every probe where it is true moves the upper bound down.
 * NOTE(review): presumably both sequences are already ordered by comp() -
 * confirm against the callers.
 */
int merge_path_int(int a_count, int b_count, int diag,
                   local const int * a0, local const int * b0)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int probe = (lo + hi) >> 1;

        if (comp(b0[diag - 1 - probe], a0[probe]))
            hi = probe;
        else
            lo = probe + 1;
    }

    return lo;
}
/*
 * One merge pass over keys held in local memory.
 *
 *   tid           work-item index used to select the pair of runs and the diagonal
 *   count         number of valid keys; every offset is clamped to it
 *   coop          number of work-items cooperating on one merged run
 *   indices       out: source positions of the 11 merged keys
 *   keys_shared0  keys in local memory
 *   keys0         out: the 11 merged keys
 *
 * The low bits of `tid` (tid & (coop - 1)) pick the diagonal, the remaining
 * bits pick where the pair of runs starts. merge_path_int() locates the
 * split and serial_merge_11_int() produces the 11 outputs.
 */
void block_sort_pass_1_11_int(int tid, int count, int coop, int * indices,
                              local const int * keys_shared0, int * keys0)
{
    const int mask = coop - 1;
    const int first = 11 * (tid & ~mask);
    const int diag = min(count, 11 * (tid & mask));

    const int a_begin = min(count, first);
    const int b_begin = min(count, first + 11 * (coop / 2));
    const int b_end = min(count, first + 11 * coop);

    const int split = merge_path_int(b_begin - a_begin, b_end - b_begin, diag,
                                     keys_shared0 + a_begin, keys_shared0 + b_begin);

    serial_merge_11_int(a_begin + split, b_begin, b_begin + diag - split, b_end,
                        indices, keys_shared0, keys0);
}
/*
 * Gathers 11 values from local memory: reg0[i] = data0[indices[i]].
 *
 *   indices  11 positions into data0
 *   tid      not used by this function
 *   data0    source values in local memory
 *   reg0     out: the 11 gathered values
 *
 * Ends with a work-group barrier (local memory fence).
 */
void gather_1_11_float(const int * indices, int tid,
                       local const float * data0, float * reg0)
{
    for (int i = 0; i < 11; ++i)
        reg0[i] = data0[indices[i]];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Merge-pass driver called by mergesort_1_11_int_float().
 *
 * This generated variant contains no merge passes: the function does
 * nothing with its arguments. The two scratch arrays the original declared
 * (`indices` and `keys0`) were never read or written and have been removed.
 * NOTE(review): presumably the generator emits zero passes for this tile
 * configuration - confirm against the code generator.
 */
void block_sort_loop_1_11_int_float(int tid, int count,
                                    local int * keys_shared0,
                                    float * thread_vals0,
                                    local float * vals_shared0)
{
}
/*
 * Sorts the key/value pairs of one tile.
 *
 *   count         number of valid elements in the tile
 *   tid           work-item index
 *   thread_keys0  the 11 keys held by this work-item
 *   keys_shared0  key storage in local memory
 *   thread_vals0  the 11 values held by this work-item
 *   vals_shared0  value storage in local memory
 *
 * Steps: sort the work-item's own 11 pairs (only when its first element,
 * 11 * tid, lies inside the tile), store the keys to local memory, then
 * hand over to block_sort_loop_1_11_int_float().
 */
void mergesort_1_11_int_float(int count, int tid, int * thread_keys0,
                              local int * keys_shared0, float * thread_vals0,
                              local float * vals_shared0)
{
    const int first = 11 * tid;

    if (first < count)
    {
        odd_even_transpose_sort_11_int_float(thread_keys0, thread_vals0);
    }

    thread_to_shared_11_int(thread_keys0, tid, keys_shared0);
    block_sort_loop_1_11_int_float(tid, count, keys_shared0, thread_vals0, vals_shared0);
}
/*
 * Kernel: sorts one tile of key/value pairs per work-group.
 *
 *   count      total number of elements
 *   keys_src0  input keys (global memory)
 *   keys_dst0  output keys (global memory)
 *   vals_src0  input values (global memory)
 *   vals_dst0  output values (global memory)
 *
 * The tile of a work-group starts at 11 * get_group_id(0) and holds
 * min(11, count - start) elements. Keys and values share one local buffer
 * through a union, so they are staged one after the other.
 */
kernel void block_sort(int count,
                       global const int * keys_src0, global int * keys_dst0,
                       global const float * vals_src0, global float * vals_dst0)
{
    // Keys and values overlay the same local storage.
    union Shared
    {
        struct
        {
            int keys0[12];
        };
        struct
        {
            float vals0[11];
        };
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int gid = 11 * block;
    const int count2 = min(11, count - gid);

    // Values first: global -> local -> private.
    float thread_vals0[11];
    global_to_shared_1_11_float(count2, vals_src0 + gid, tid, shared.vals0);
    shared_to_thread_11_float(shared.vals0, tid, thread_vals0);

    // Then the keys, reusing the same local storage.
    int thread_keys0[11];
    global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0);
    shared_to_thread_11_int(shared.keys0, tid, thread_keys0);

    const int first = 11 * tid;
    if (first < count2 && first + 11 > count2)
    {
        // This work-item straddles the end of the tile: find the largest
        // valid key (according to comp) ...
        int max_key0 = thread_keys0[0];
        for (int i = 1; i < 11; ++i)
        {
            if (first + i < count2 && comp(max_key0, thread_keys0[i]))
            {
                max_key0 = thread_keys0[i];
            }
        }

        // ... and overwrite every slot past the end of the tile with it.
        for (int i = 0; i < 11; ++i)
        {
            if (first + i >= count2)
            {
                thread_keys0[i] = max_key0;
            }
        }
    }

    mergesort_1_11_int_float(count2, tid, thread_keys0, shared.keys0, thread_vals0, shared.vals0);

    shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid);
    thread_to_shared_11_float(thread_vals0, tid, shared.vals0);
    shared_to_global_1_11_float(count2, shared.vals0, tid, vals_dst0 + gid);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Key ordering predicate: true when x is strictly less than y. */
bool comp(int x, int y)
{
    return y > x;
}
/*
 * Computes the frame of a pair of runs for one block.
 *
 *   coop   number of blocks cooperating on one merged run
 *   block  block index
 *   nv     number of elements per block
 *
 * Returns an int4 with
 *   x = nv * (block & ~(coop - 1))   offset of the first run
 *   y = x + nv * (coop >> 1)         offset of the second run
 *   z = nv * (coop >> 1)             length of one run
 *   w = 0                            unused; set explicitly so that the
 *                                    returned vector never carries an
 *                                    uninitialized component
 */
int4 find_mergesort_frame(int coop, int block, int nv)
{
    int start = ~(coop - 1) & block;
    int size = nv * (coop >> 1);

    return (int4)(nv * start, nv * start + size, size, 0);
}
/*
 * Computes the source intervals handled by one block.
 *
 *   frame     frame of the pair of runs (x, y: run offsets, z: run length)
 *   coop      number of blocks cooperating on one merged run
 *   block     block index
 *   nv        number of elements per block
 *   count     total element count; all upper bounds are clamped to it
 *   mp0, mp1  split positions at the start and end of the block
 *
 * Returns (a_begin, a_end, b_begin, b_end). For the last block of a
 * cooperating group (all low bits of `block` set) both end points are the
 * ends of the two runs instead of being derived from mp1.
 */
int4 find_mergesort_interval(int4 frame, int coop, int block, int nv,
                             int count, int mp0, int mp1)
{
    const int diag = nv * block - frame.x;
    const bool last_in_group = ((coop - 1) & block) == (coop - 1);

    const int a_end = last_in_group ? frame.x + frame.z
                                    : frame.x + mp1;
    const int b_end = last_in_group ? frame.y + frame.z
                                    : frame.y + diag + nv - mp1;

    return (int4)(frame.x + mp0,
                  min(count, a_end),
                  min(count, frame.y + diag - mp0),
                  min(count, b_end));
}
/*
 * Determines which parts of the two inputs a block has to merge.
 *
 *   a_count, b_count  input sizes
 *   block             block index
 *   coop              non-zero: use the frame/interval computation;
 *                     zero: use the split positions directly
 *   nv                number of elements per block
 *   mp_global         split positions; entries `block` and `block + 1` are read
 *
 * Returns an int4 range. With coop != 0 it is the result of
 * find_mergesort_interval(); otherwise x and y are the two split
 * positions, z = nv * block - x and w = min(a_count + b_count,
 * nv * block + nv) - y.
 */
int4 compute_merge_range(int a_count, int b_count, int block, int coop,
                         int nv, global const int * mp_global)
{
    const int mp0 = mp_global[block];
    const int mp1 = mp_global[block + 1];

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, block, nv);
        return find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }

    const int gid = nv * block;
    return (int4)(mp0,
                  mp1,
                  gid - mp0,
                  min(a_count + b_count, gid + nv) - mp1);
}
/*
 * Bounds-checked load of up to 11 floats from global memory.
 * reg[i] receives data[i + tid] for every i in 0..10 with i + tid < count;
 * the other entries of reg are left untouched.
 */
void global_to_regstr_pred_1_11_float(int count, global const float * data,
                                      int tid, float * reg)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            reg[i] = data[pos];
    }
}
/*
 * Loads 11 floats from global memory: reg[i] = data[i + tid].
 * When fewer than 11 elements are available (count < 11) the
 * bounds-checked variant is used instead.
 */
void global_to_regstr_1_11_float(int count, global const float * data,
                                 int tid, float * reg)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
}
/*
 * Bounds-checked store of up to 11 floats to global memory.
 * dest[i + tid] = reg[i] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_global_1_11_float(int count, const float * reg, int tid,
                                 global float * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = reg[i];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads 11 floats from local memory: reg[i] = data[i + tid].
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_regstr_1_11_float(local const float * data, int tid, float * reg)
{
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores 11 floats to local memory: dest[i + tid] = reg[i].
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_shared_1_11_float(const float * reg, int tid, local float * dest)
{
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 floats from global to local memory, staging them in a
 * private array: global_to_regstr_1_11_float() followed by
 * regstr_to_shared_1_11_float().
 */
void global_to_shared_1_11_float(int count, global const float * source,
                                 int tid, local float * dest)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
/*
 * Bounds-checked copy of up to 11 floats from local to global memory.
 * dest[i + tid] = source[i + tid] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_global_1_11_float(int count, local const float * source,
                                 int tid, global float * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = source[pos];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the 11 consecutive floats starting at data[11 * tid] from local
 * memory into reg. Ends with a work-group barrier (local memory fence).
 */
void shared_to_thread_11_float(local const float * data, int tid, float * reg)
{
    const int base = 11 * tid;

    for (int i = 0; i < 11; ++i)
        reg[i] = data[base + i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores the 11 floats of reg to the consecutive local-memory slots
 * starting at dest[11 * tid]. Ends with a work-group barrier (local
 * memory fence).
 */
void thread_to_shared_11_float(const float * reg, int tid, local float * dest)
{
    const int base = 11 * tid;

    for (int i = 0; i < 11; ++i)
        dest[base + i] = reg[i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to 11 ints from global memory.
 * reg[i] receives data[i + tid] for every i in 0..10 with i + tid < count;
 * the other entries of reg are left untouched.
 */
void global_to_regstr_pred_1_11_int(int count, global const int * data,
                                    int tid, int * reg)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            reg[i] = data[pos];
    }
}
/*
 * Loads 11 ints from global memory: reg[i] = data[i + tid].
 * When fewer than 11 elements are available (count < 11) the
 * bounds-checked variant is used instead.
 */
void global_to_regstr_1_11_int(int count, global const int * data,
                               int tid, int * reg)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
}
/*
 * Bounds-checked store of up to 11 ints to global memory.
 * dest[i + tid] = reg[i] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_global_1_11_int(int count, const int * reg, int tid,
                               global int * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = reg[i];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads 11 ints from local memory: reg[i] = data[i + tid].
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_regstr_1_11_int(local const int * data, int tid, int * reg)
{
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores 11 ints to local memory: dest[i + tid] = reg[i].
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_shared_1_11_int(const int * reg, int tid, local int * dest)
{
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 ints from global to local memory, staging them in a
 * private array: global_to_regstr_1_11_int() followed by
 * regstr_to_shared_1_11_int().
 */
void global_to_shared_1_11_int(int count, global const int * source,
                               int tid, local int * dest)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
/*
 * Bounds-checked copy of up to 11 ints from local to global memory.
 * dest[i + tid] = source[i + tid] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_global_1_11_int(int count, local const int * source,
                               int tid, global int * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = source[pos];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads the 11 consecutive ints starting at data[11 * tid] from local
 * memory into reg. Ends with a work-group barrier (local memory fence).
 */
void shared_to_thread_11_int(local const int * data, int tid, int * reg)
{
    const int base = 11 * tid;

    for (int i = 0; i < 11; ++i)
        reg[i] = data[base + i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores the 11 ints of reg to the consecutive local-memory slots starting
 * at dest[11 * tid]. Ends with a work-group barrier (local memory fence).
 */
void thread_to_shared_11_int(const int * reg, int tid, local int * dest)
{
    const int base = 11 * tid;

    for (int i = 0; i < 11; ++i)
        dest[base + i] = reg[i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Sequentially merges 11 keys from two ranges of one local-memory array.
 *
 *   a_begin, a_end  first range  [a_begin, a_end)
 *   b_begin, b_end  second range [b_begin, b_end)
 *   indices         out: position in keys_shared0 each output was taken from
 *   keys_shared0    keys in local memory
 *   results0        out: the 11 merged keys
 *
 * At each step the head of the first range is taken when the second range
 * is exhausted, or when the first range is not exhausted and
 * comp(b_key, a_key) is false; otherwise the head of the second range is
 * taken. After every step - including the last - the key following the
 * consumed element is loaded, so keys_shared0 is read one position past
 * the last consumed element. Ends with a work-group barrier (local memory
 * fence).
 */
void serial_merge_11_int(int a_begin, int a_end, int b_begin, int b_end,
                         int * indices, local const int * keys_shared0,
                         int * results0)
{
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];

    for (int i = 0; i < 11; ++i)
    {
        const bool take_a = (b_begin >= b_end)
                         || ((a_begin < a_end) && !comp(b_key0, a_key0));

        if (take_a)
        {
            results0[i] = a_key0;
            indices[i] = a_begin;
            a_key0 = keys_shared0[++a_begin];
        }
        else
        {
            results0[i] = b_key0;
            indices[i] = b_begin;
            b_key0 = keys_shared0[++b_begin];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Binary search along the diagonal `diag` of two key sequences held in
 * local memory.
 *
 *   a_count, b_count  number of keys available through a0 / b0
 *   diag              diagonal being searched
 *   a0, b0            the two key sequences
 *
 * The search range is [max(0, diag - b_count), min(diag, a_count)]. The
 * result is the position `i` at which the search converges: every probe
 * where comp(b0[diag - 1 - i], a0[i]) is false moves the lower bound up,
 * every probe where it is true moves the upper bound down.
 * NOTE(review): presumably both sequences are already ordered by comp() -
 * confirm against the callers.
 */
int merge_path_int(int a_count, int b_count, int diag,
                   local const int * a0, local const int * b0)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int probe = (lo + hi) >> 1;

        if (comp(b0[diag - 1 - probe], a0[probe]))
            hi = probe;
        else
            lo = probe + 1;
    }

    return lo;
}
/*
 * Loads 11 ints from the concatenation of two global-memory ranges.
 *
 *   a_global, a_count  first range and its length
 *   b_global, b_count  second range and its length
 *   tid                offset added to every element position
 *   reg                out: the loaded values
 *
 * For position p = i + tid: reg[i] = a_global[p] when p < a_count,
 * otherwise b_global[p - a_count]. When the two ranges together hold fewer
 * than 11 elements, positions at or past a_count + b_count are skipped and
 * the corresponding reg entries are left untouched.
 */
void load2_to_regstr_1_11_11_int(global const int * a_global, int a_count,
                                 global const int * b_global, int b_count,
                                 int tid, int * reg)
{
    const int total = a_count + b_count;
    const bool full_tile = total >= 11;

    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;

        if (pos < a_count)
            reg[i] = a_global[pos];
        else if (full_tile || pos < total)
            reg[i] = b_global[pos - a_count];
    }
}
/*
 * Loads the concatenation of two global-memory ranges into local memory,
 * staging the values in a private array: load2_to_regstr_1_11_11_int()
 * followed by regstr_to_shared_1_11_int().
 */
void load2_to_shared_1_11_11_int(global const int * a_global, int a_count,
                                 global const int * b_global, int b_count,
                                 int tid, local int * shared)
{
    int staged[11];

    load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, shared);
}
/*
 * Merges the keys of one block and records where each output came from.
 *
 *   a_count, b_count  ignored on entry: the lengths actually used are
 *                     derived from `range`
 *   range             (a_begin, a_end, b_begin, b_end) into the two inputs
 *   tid               work-item index
 *   indices           out: positions (in the local key buffer) of the outputs
 *   a_global0         first key input (global memory)
 *   b_global0         second key input (global memory)
 *   keys_shared0      local key buffer; receives the first range followed
 *                     by the second
 *   results0          out: the 11 merged keys of this work-item
 */
void merge_keys_indices_1_11_int(int a_count, int b_count, int4 range, int tid,
                                 int * indices,
                                 global const int * a_global0,
                                 global const int * b_global0,
                                 local int * keys_shared0, int * results0)
{
    const int a_len = range.y - range.x;
    const int b_len = range.w - range.z;

    load2_to_shared_1_11_11_int(a_global0 + range.x, a_len,
                                b_global0 + range.z, b_len,
                                tid, keys_shared0);

    // Each work-item starts at output position 11 * tid.
    const int diag = 11 * tid;
    const int split = merge_path_int(a_len, b_len, diag,
                                     keys_shared0, keys_shared0 + a_len);

    serial_merge_11_int(split, a_len,
                        a_len + diag - split, a_len + b_len,
                        indices, keys_shared0, results0);
}
/*
 * Fetches the values that belong to 11 merged keys.
 *
 *   count      number of valid outputs
 *   b_start    first index that refers to the second input
 *   indices    11 source indices
 *   tid        offset added to the element number for the bounds check
 *   a_global0  first value input (global memory)
 *   b_global0  second value input (global memory)
 *   reg0       out: the fetched values
 *
 * An index below b_start selects a_global0[index]; any other index selects
 * b_global0[index - b_start]. When count < 11, element i is fetched only
 * if i + tid < count; skipped entries of reg0 are left untouched. Ends
 * with a work-group barrier (local memory fence).
 */
void transfer_merge_values_regstr_1_11_float(int count, int b_start,
                                             const int * indices, int tid,
                                             global const float * a_global0,
                                             global const float * b_global0,
                                             float * reg0)
{
    const bool full_tile = count >= 11;

    for (int i = 0; i < 11; ++i)
    {
        if (!full_tile && i + tid >= count)
            continue;

        const int src = indices[i];
        if (src < b_start)
            reg0[i] = a_global0[src];
        else
            reg0[i] = b_global0[src - b_start];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Moves the values of one merged block to their destination.
 *
 * Reads 11 source indices from local memory, fetches the corresponding
 * values from the two global inputs with
 * transfer_merge_values_regstr_1_11_float(), and writes them to
 * dest_global0 with regstr_to_global_1_11_float().
 */
void transfer_merge_values_shared_1_11_float(int count, int b_start,
                                             local const int * indices_shared,
                                             int tid,
                                             global const float * a_global0,
                                             global const float * b_global0,
                                             global float * dest_global0)
{
    int src_indices[11];
    float fetched0[11];

    shared_to_regstr_1_11_int(indices_shared, tid, src_indices);
    transfer_merge_values_regstr_1_11_float(count, b_start, src_indices, tid,
                                            a_global0, b_global0, fetched0);
    regstr_to_global_1_11_float(count, fetched0, tid, dest_global0);
}
/*
 * Merges the key/value pairs of one block.
 *
 *   a_count, b_count        passed through to merge_keys_indices_1_11_int();
 *                           the lengths used afterwards come from `range`
 *   a_keys_global0 / b_keys_global0   key inputs
 *   keys_global0            key output
 *   keys_shared0            local key buffer
 *   a_vals_global0 / b_vals_global0   value inputs
 *   vals_global0            value output
 *   tid, block              work-item and block index
 *   range                   (a_begin, a_end, b_begin, b_end)
 *   indices_shared          local buffer for the source indices
 *
 * Keys are merged first and written to keys_global0 + 11 * block; the
 * source indices recorded during the merge are then used to move the
 * values to vals_global0 + 11 * block.
 */
void device_merge_1_11_int_float(int a_count, int b_count,
                                 global const int * a_keys_global0,
                                 global const int * b_keys_global0,
                                 global int * keys_global0,
                                 local int * keys_shared0,
                                 global const float * a_vals_global0,
                                 global const float * b_vals_global0,
                                 global float * vals_global0,
                                 int tid, int block, int4 range,
                                 local int * indices_shared)
{
    int merged_keys0[11];
    int src_indices[11];

    merge_keys_indices_1_11_int(a_count, b_count, range, tid, src_indices,
                                a_keys_global0, b_keys_global0,
                                keys_shared0, merged_keys0);
    thread_to_shared_11_int(merged_keys0, tid, keys_shared0);

    const int a_len = range.y - range.x;
    const int total = a_len + (range.w - range.z);
    const int tile_offset = 11 * block;

    shared_to_global_1_11_int(total, keys_shared0, tid, keys_global0 + tile_offset);

    thread_to_shared_11_int(src_indices, tid, indices_shared);
    transfer_merge_values_shared_1_11_float(total, a_len, indices_shared, tid,
                                            a_vals_global0 + range.x,
                                            b_vals_global0 + range.z,
                                            vals_global0 + tile_offset);
}
/*
 * Kernel: merges one block of key/value pairs per work-group.
 *
 *   a_count, b_count        input sizes
 *   a_keys_global0 / b_keys_global0   key inputs
 *   keys_global0            key output
 *   a_vals_global0 / b_vals_global0   value inputs
 *   vals_global0            value output
 *   mp_global               split positions, indexed by block
 *   coop                    forwarded to compute_merge_range()
 *
 * The range of the block is computed with compute_merge_range() (11
 * elements per block) and the work is done by
 * device_merge_1_11_int_float().
 */
kernel void merge(int a_count, int b_count,
                  global const int * a_keys_global0,
                  global const int * b_keys_global0,
                  global int * keys_global0,
                  global const float * a_vals_global0,
                  global const float * b_vals_global0,
                  global float * vals_global0,
                  global const int * mp_global,
                  int coop)
{
    // The key buffer and the index buffer overlay the same local storage.
    union Shared
    {
        struct
        {
            int keys0[12];
        };
        int indices[11];
    };
    local union Shared scratch;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global);

    device_merge_1_11_int_float(a_count, b_count,
                                a_keys_global0, b_keys_global0, keys_global0,
                                scratch.keys0,
                                a_vals_global0, b_vals_global0, vals_global0,
                                tid, block, range, scratch.indices);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Key ordering predicate: true when x is strictly less than y. */
bool comp(int x, int y)
{
    return y > x;
}
/*
 * Binary search along the diagonal `diag` of two key sequences held in
 * global memory.
 *
 *   a_count, b_count  number of keys available through a0 / b0
 *   diag              diagonal being searched
 *   a0, b0            the two key sequences
 *
 * The search range is [max(0, diag - b_count), min(diag, a_count)]. The
 * result is the position `i` at which the search converges: every probe
 * where comp(b0[diag - 1 - i], a0[i]) is false moves the lower bound up,
 * every probe where it is true moves the upper bound down.
 * NOTE(review): presumably both sequences are already ordered by comp() -
 * confirm against the callers.
 */
int merge_path_int
(
    int a_count,
    int b_count,
    int diag,
    global const int * a0,
    global const int * b0
)
{
    int begin = max(0, diag - b_count);
    int end = min(diag, a_count);
    while (begin < end)
    {
        // begin + end can exceed INT_MAX on very large global ranges;
        // begin plus half the remaining width yields the same midpoint
        // without the overflow.
        int mid = begin + ((end - begin) >> 1);
        if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1;
        else end = mid;
    }
    return begin;
}
/*
 * Computes the frame of a pair of runs for one block.
 *
 *   coop   number of blocks cooperating on one merged run
 *   block  block index
 *   nv     number of elements per block
 *
 * Returns an int4 with
 *   x = nv * (block & ~(coop - 1))   offset of the first run
 *   y = x + nv * (coop >> 1)         offset of the second run
 *   z = nv * (coop >> 1)             length of one run
 *   w = 0                            unused; set explicitly so that the
 *                                    returned vector never carries an
 *                                    uninitialized component
 */
int4 find_mergesort_frame(int coop, int block, int nv)
{
    int start = ~(coop - 1) & block;
    int size = nv * (coop >> 1);

    return (int4)(nv * start, nv * start + size, size, 0);
}
/*
 * Kernel: computes one split position per partition.
 *
 *   a_count, b_count  input sizes
 *   nv                number of elements per partition
 *   coop              non-zero: restrict the search to the pair of runs
 *                     given by find_mergesort_frame(); zero: search the
 *                     two inputs as a whole
 *   mp_global         out: split position of every partition
 *   num_searches      number of partitions; work-items with a larger
 *                     global id do nothing
 *   a_global0, b_global0  key inputs
 *
 * With coop != 0 both runs are clamped to a_count, and the diagonal is
 * taken relative to the start of the first run.
 */
kernel void merge_partition(int a_count, int b_count, int nv, int coop,
                            global int * mp_global, int num_searches,
                            global const int * a_global0,
                            global const int * b_global0)
{
    const int partition = get_global_id(0);
    if (partition >= num_searches)
        return;

    int a_offset = 0;
    int b_offset = 0;
    int a_len = a_count;
    int b_len = b_count;
    int diag = nv * partition;

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, partition, nv);

        a_offset = frame.x;
        b_offset = min(a_count, frame.y);
        b_len = min(a_count, frame.y + frame.z) - b_offset;
        a_len = min(a_count, frame.x + frame.z) - a_offset;
        diag -= a_offset;
    }

    mp_global[partition] = merge_path_int(a_len, b_len, min(diag, a_len + b_len),
                                          a_global0 + a_offset, b_global0 + b_offset);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Key ordering predicate that compares the lowest bit first: a value whose
 * lowest bit is 0 precedes any value whose lowest bit is 1; values with the
 * same lowest bit are ordered by a < b.
 */
bool comp(int a, int b)
{
    const int low_a = a & 1;
    const int low_b = b & 1;

    return (low_a != low_b) ? (low_a < low_b) : (a < b);
}
/*
 * Bounds-checked load of up to 11 floats from global memory.
 * reg[i] receives data[i + tid] for every i in 0..10 with i + tid < count;
 * the other entries of reg are left untouched.
 */
void global_to_regstr_pred_1_11_float(int count, global const float * data,
                                      int tid, float * reg)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            reg[i] = data[pos];
    }
}
/*
 * Loads 11 floats from global memory: reg[i] = data[i + tid].
 * When fewer than 11 elements are available (count < 11) the
 * bounds-checked variant is used instead.
 */
void global_to_regstr_1_11_float(int count, global const float * data,
                                 int tid, float * reg)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
}
/*
 * Bounds-checked store of up to 11 floats to global memory.
 * dest[i + tid] = reg[i] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_global_1_11_float(int count, const float * reg, int tid,
                                 global float * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = reg[i];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Loads 11 floats from local memory: reg[i] = data[i + tid].
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_regstr_1_11_float(local const float * data, int tid, float * reg)
{
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores 11 floats to local memory: dest[i + tid] = reg[i].
 * Ends with a work-group barrier (local memory fence).
 */
void regstr_to_shared_1_11_float(const float * reg, int tid, local float * dest)
{
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 11 floats from global to local memory, staging them in a
 * private array: global_to_regstr_1_11_float() followed by
 * regstr_to_shared_1_11_float().
 */
void global_to_shared_1_11_float(int count, global const float * source,
                                 int tid, local float * dest)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
/*
 * Bounds-checked copy of up to 11 floats from local to global memory.
 * dest[i + tid] = source[i + tid] for every i in 0..10 with i + tid < count.
 * Ends with a work-group barrier (local memory fence).
 */
void shared_to_global_1_11_float(int count, local const float * source,
                                 int tid, global float * dest)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
            dest[pos] = source[pos];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
// Loads the 11 consecutive floats that start at data[11 * tid] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence.
void shared_to_thread_11_float(local const float *data, int tid, float *reg)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = data[base + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores reg[0..10] into the 11 consecutive local-memory slots that start
// at dest[11 * tid], then issues a work-group barrier with a local-memory
// fence.
void thread_to_shared_11_float(const float *reg, int tid, local float *dest)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        dest[base + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Bounds-checked load of up to 11 ints from global memory.
// For i in 0..10, reg[i] receives data[i + tid] only when i + tid < count;
// other entries of `reg` are left untouched. No barrier is issued.
void global_to_regstr_pred_1_11_int(int count, global const int *data, int tid, int *reg)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            reg[i] = data[slot];
        }
    }
}
// Loads 11 ints from global memory into reg[0..10].
// When count >= 11 every element data[i + tid] is read without a bounds
// check; otherwise the work is delegated to the bounds-checked variant.
void global_to_regstr_1_11_int(int count, global const int *data, int tid, int *reg)
{
    if (count < 11) {
        // Partial tile: only read slots below `count`.
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i) {
        reg[i] = data[i + tid];
    }
}
// Stores up to 11 ints from `reg` into global memory `dest`.
//   count - exclusive upper bound on the destination index; slots at or
//           beyond it are not written
//   reg   - source array; entries 0..10 are read only when their slot is in range
//   tid   - offset added to every element index
//   dest  - global destination; reg[i] goes to dest[i + tid]
// Finishes with a work-group barrier using a local-memory fence.
void regstr_to_global_1_11_int(int count, const int *reg, int tid, global int *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies the 11 consecutive ints data[tid] .. data[tid + 10] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void shared_to_regstr_1_11_int(local const int *data, int tid, int *reg)
{
    local const int *src = data + tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = src[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots
// dest[tid] .. dest[tid + 10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void regstr_to_shared_1_11_int(const int *reg, int tid, local int *dest)
{
    local int *out = dest + tid;
    for (int i = 0; i < 11; ++i) {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves ints from global `source` to local `dest` through an 11-element
// temporary: first a (possibly count-limited) load from global memory,
// then an unconditional store of all 11 temporaries to local memory.
// The temporary is not initialized here, so entries the load skips are
// stored to `dest` with whatever value they hold.
void global_to_shared_1_11_int(int count, global const int *source, int tid, local int *dest)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
// Copies ints from local `source` to global `dest` at the same index:
// for i in 0..10, slot i + tid is copied when it is below `count`.
// Finishes with a work-group barrier using a local-memory fence.
void shared_to_global_1_11_int(int count, local const int *source, int tid, global int *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = source[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Loads the 11 consecutive ints that start at data[11 * tid] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence.
void shared_to_thread_11_int(local const int *data, int tid, int *reg)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = data[base + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores reg[0..10] into the 11 consecutive local-memory slots that start
// at dest[11 * tid], then issues a work-group barrier with a local-memory
// fence.
void thread_to_shared_11_int(const int *reg, int tid, local int *dest)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        dest[base + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Performs 11 steps of a two-way merge over keys held in local memory.
//   a_begin/a_end - cursor and exclusive end of the first range
//   b_begin/b_end - cursor and exclusive end of the second range
//   indices       - receives, per step, the position the key was taken from
//   keys_shared0  - local-memory key array both ranges index into
//   results0      - receives, per step, the selected key
// At each step the first range is chosen when the second is exhausted, or
// when the first is not exhausted and comp(b_key, a_key) is false (so ties
// go to the first range). After a selection the next key is fetched at the
// advanced cursor unconditionally, so keys_shared0 may be read at or beyond
// a range's end; callers must keep those slots readable.
// Finishes with a work-group barrier using a local-memory fence.
void serial_merge_11_int(int a_begin, int a_end, int b_begin, int b_end, int *indices, local const int *keys_shared0, int *results0)
{
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];

    for (int step = 0; step < 11; ++step) {
        const bool take_a = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0));
        if (take_a) {
            results0[step] = a_key0;
            indices[step] = a_begin;
            a_key0 = keys_shared0[++a_begin];
        } else {
            results0[step] = b_key0;
            indices[step] = b_begin;
            b_key0 = keys_shared0[++b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Exchanges the two ints pointed to by a0 and b0.
void swap_int(int *a0, int *b0)
{
    const int saved = *a0;
    *a0 = *b0;
    *b0 = saved;
}
// Exchanges the two floats pointed to by a0 and b0.
void swap_float(float *a0, float *b0)
{
    const float saved = *a0;
    *a0 = *b0;
    *b0 = saved;
}
// Sorts 11 key/value pairs in place with an odd-even transposition network.
// Runs 11 rounds; even-numbered rounds compare the pairs (0,1),(2,3),...,(8,9)
// and odd-numbered rounds compare (1,2),(3,4),...,(9,10). Two neighbours are
// exchanged (keys and their values together) when comp(right, left) holds.
void odd_even_transpose_sort_11_int_float(int *keys0, float *vals0)
{
    for (int round = 0; round < 11; ++round) {
        for (int left = round & 1; left < 10; left += 2) {
            const int right = left + 1;
            if (comp(keys0[right], keys0[left])) {
                swap_int(keys0 + left, keys0 + right);
                swap_float(vals0 + left, vals0 + right);
            }
        }
    }
}
// Binary search along the diagonal `diag` of the merge of a0 and b0.
// The search interval is [max(0, diag - b_count), min(diag, a_count)]; it is
// narrowed toward the first index `mid` for which
// comp(b0[diag - 1 - mid], a0[mid]) is true, and the resulting lower bound is
// returned. Assumes the predicate is monotone over the interval (i.e. both
// inputs are ordered under comp) - TODO confirm against callers.
int merge_path_int(int a_count, int b_count, int diag, local const int *a0, local const int *b0)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi) {
        const int mid = (lo + hi) >> 1;
        if (comp(b0[diag - 1 - mid], a0[mid])) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }
    return lo;
}
// One merge pass over keys in local memory for the work-item `tid`.
// The masks (coop - 1) and ~(coop - 1) split `tid` into a position inside a
// group and the group's first tid (this reading assumes `coop` is a power of
// two - TODO confirm). Two adjacent runs of 11 * (coop / 2) keys, clamped to
// `count`, are located; merge_path_int finds where this work-item's diagonal
// splits them, and serial_merge_11_int then emits 11 keys into `keys0` and
// their source positions into `indices`.
void block_sort_pass_1_11_int(int tid, int count, int coop, int *indices, local const int *keys_shared0, int *keys0)
{
    const int mask = coop - 1;
    const int group_first = ~mask & tid;
    const int diag = min(count, 11 * (mask & tid));

    const int base = 11 * group_first;
    const int a_lo = min(count, base);
    const int a_hi = min(count, base + 11 * (coop / 2)); // also where the second run starts
    const int b_hi = min(count, base + 11 * coop);

    const int split = merge_path_int(a_hi - a_lo, b_hi - a_hi, diag,
                                     keys_shared0 + a_lo, keys_shared0 + a_hi);
    serial_merge_11_int(a_lo + split, a_hi, a_hi + diag - split, b_hi,
                        indices, keys_shared0, keys0);
}
// Gathers 11 floats from local memory: reg0[i] = data0[indices[i]] for
// i in 0..10, followed by a work-group barrier with a local-memory fence.
// `tid` is accepted but not used.
void gather_1_11_float(const int *indices, int tid, local const float *data0, float *reg0)
{
    for (int i = 0; i < 11; ++i) {
        const int from = indices[i];
        reg0[i] = data0[from];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Merge-pass driver for the block sort. This specialization contains no
// statements: it performs no merge passes and touches none of its
// arguments. (Presumably the generator emitted zero passes because a single
// work-item already holds the whole tile - TODO confirm.)
void block_sort_loop_1_11_int_float
(
int tid,
int count,
local int * keys_shared0,
float * thread_vals0,
local float * vals_shared0
)
{
}
// Sorts this work-item's 11 key/value pairs and publishes the keys.
// The in-register odd-even sort runs only when the work-item's first slot
// (11 * tid) is below `count`; afterwards the keys are always written to
// local memory at keys_shared0[11 * tid ...] and the block-level merge loop
// is invoked.
void mergesort_1_11_int_float(int count, int tid, int *thread_keys0, local int *keys_shared0, float *thread_vals0, local float *vals_shared0)
{
    const bool owns_items = 11 * tid < count;
    if (owns_items) {
        odd_even_transpose_sort_11_int_float(thread_keys0, thread_vals0);
    }

    thread_to_shared_11_int(thread_keys0, tid, keys_shared0);
    block_sort_loop_1_11_int_float(tid, count, keys_shared0, thread_vals0, vals_shared0);
}
// Sorts one tile of up to 11 key/value pairs per work-group.
//   count     - total number of elements
//   keys_src0 - input keys        keys_dst0 - output keys
//   vals_src0 - input values      vals_dst0 - output values
// Work-group `block` handles the elements starting at 11 * block; count2 is
// the number of elements of that tile that exist (at most 11).
kernel void block_sort(int count, global const int *keys_src0, global int *keys_dst0, global const float *vals_src0, global float *vals_dst0)
{
    // Keys and values share the same local storage (members of one union).
    union Shared
    {
        struct
        {
            int keys0[12];
        };
        struct
        {
            float vals0[11];
        };
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int gid = 11 * block;
    const int count2 = min(11, count - gid);

    // Stage the values through local memory into this work-item's array.
    float thread_vals0[11];
    global_to_shared_1_11_float(count2, vals_src0 + gid, tid, shared.vals0);
    shared_to_thread_11_float(shared.vals0, tid, thread_vals0);

    // Same for the keys.
    int thread_keys0[11];
    global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0);
    shared_to_thread_11_int(shared.keys0, tid, thread_keys0);

    // A work-item whose 11 slots straddle the end of the tile overwrites its
    // out-of-range keys with the greatest in-range key (per comp),
    // presumably so the padding sorts to the end.
    const int first = 11 * tid;
    if (first < count2 && first + 11 > count2) {
        int max_key0 = thread_keys0[0];
        for (int i = 1; i < 11; ++i) {
            // The range check must come first: out-of-range keys are not compared.
            if (first + i < count2 && comp(max_key0, thread_keys0[i])) {
                max_key0 = thread_keys0[i];
            }
        }
        for (int i = 0; i < 11; ++i) {
            if (first + i >= count2) {
                thread_keys0[i] = max_key0;
            }
        }
    }

    mergesort_1_11_int_float(count2, tid, thread_keys0, shared.keys0, thread_vals0, shared.vals0);

    // Keys are taken from local memory; values are first written back to
    // local memory from the per-work-item array, then copied out.
    shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid);
    thread_to_shared_11_float(thread_vals0, tid, shared.vals0);
    shared_to_global_1_11_float(count2, shared.vals0, tid, vals_dst0 + gid);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Ordering predicate: returns true when `a` should come before `b`.
// Values are ordered first by their lowest bit (bit 0 clear before bit 0
// set) and, when the lowest bits match, by ordinary `<`.
bool comp(int a, int b)
{
    const int low_a = a & 1;
    const int low_b = b & 1;
    return (low_a != low_b) ? (low_a < low_b) : (a < b);
}
int4 find_mergesort_frame
(
int coop,
int block,
int nv
)
{
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
int4 frame;
frame.x = nv * start;
frame.y = nv * start + size;
frame.z = size;
return frame;
}
// Converts a frame plus two merge-path values into the sub-ranges one block
// must merge.
//   frame    - .x / .y are the starts of the two runs, .z their length
//   coop     - number of cooperating blocks
//   block    - index of the calling block
//   nv       - elements per block
//   count    - total element count; all ends are clamped to it
//   mp0, mp1 - merge-path values at the start and end of this block
// Returns (.x, .y) as begin/end in the first run and (.z, .w) as begin/end in
// the second run. For the block whose low bits equal coop - 1 (the last block
// of its group), both ends run to the end of their respective runs instead of
// being derived from mp1.
int4 find_mergesort_interval(int4 frame, int coop, int block, int nv, int count, int mp0, int mp1)
{
    const int diag = nv * block - frame.x;
    const bool last_in_group = ((coop - 1) & block) == coop - 1;

    int4 interval;
    interval.x = frame.x + mp0;
    interval.z = min(count, frame.y + diag - mp0);
    if (last_in_group) {
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    } else {
        interval.y = min(count, frame.x + mp1);
        interval.w = min(count, frame.y + diag + nv - mp1);
    }
    return interval;
}
// Determines the input ranges a block has to merge, based on the merge-path
// values mp_global[block] and mp_global[block + 1].
// - coop != 0: the range is derived from the mergesort frame of the block's
//   group (only a_count is used as the clamp).
// - coop == 0: .x/.y are the two merge-path values, and .z/.w are the
//   block's first output index (nv * block) and its clamped end
//   (min(a_count + b_count, nv * block + nv)) minus .x and .y respectively.
int4 compute_merge_range(int a_count, int b_count, int block, int coop, int nv, global const int *mp_global)
{
    const int mp0 = mp_global[block];
    const int mp1 = mp_global[block + 1];

    if (coop) {
        const int4 frame = find_mergesort_frame(coop, block, nv);
        return find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }

    const int gid = nv * block;
    int4 range;
    range.x = mp0;
    range.y = mp1;
    range.z = gid - mp0;
    range.w = min(a_count + b_count, gid + nv) - mp1;
    return range;
}
// Bounds-checked load of up to 11 floats from global memory.
// For i in 0..10, reg[i] receives data[i + tid] only when i + tid < count;
// other entries of `reg` are left untouched. No barrier is issued.
void global_to_regstr_pred_1_11_float(int count, global const float *data, int tid, float *reg)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            reg[i] = data[slot];
        }
    }
}
// Loads 11 floats from global memory into reg[0..10].
// When count >= 11 every element data[i + tid] is read without a bounds
// check; otherwise the work is delegated to the bounds-checked variant.
void global_to_regstr_1_11_float(int count, global const float *data, int tid, float *reg)
{
    if (count < 11) {
        // Partial tile: only read slots below `count`.
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i) {
        reg[i] = data[i + tid];
    }
}
// Stores up to 11 floats from `reg` into global memory `dest`.
//   count - exclusive upper bound on the destination index; slots at or
//           beyond it are not written
//   reg   - source array; entries 0..10 are read only when their slot is in range
//   tid   - offset added to every element index
//   dest  - global destination; reg[i] goes to dest[i + tid]
// Finishes with a work-group barrier using a local-memory fence.
void regstr_to_global_1_11_float(int count, const float *reg, int tid, global float *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies the 11 consecutive floats data[tid] .. data[tid + 10] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void shared_to_regstr_1_11_float(local const float *data, int tid, float *reg)
{
    local const float *src = data + tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = src[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots
// dest[tid] .. dest[tid + 10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void regstr_to_shared_1_11_float(const float *reg, int tid, local float *dest)
{
    local float *out = dest + tid;
    for (int i = 0; i < 11; ++i) {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves floats from global `source` to local `dest` through an 11-element
// temporary: first a (possibly count-limited) load from global memory,
// then an unconditional store of all 11 temporaries to local memory.
// The temporary is not initialized here, so entries the load skips are
// stored to `dest` with whatever value they hold.
void global_to_shared_1_11_float(int count, global const float *source, int tid, local float *dest)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
// Copies floats from local `source` to global `dest` at the same index:
// for i in 0..10, slot i + tid is copied when it is below `count`.
// Finishes with a work-group barrier using a local-memory fence.
void shared_to_global_1_11_float(int count, local const float *source, int tid, global float *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = source[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Loads the 11 consecutive floats that start at data[11 * tid] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence.
void shared_to_thread_11_float(local const float *data, int tid, float *reg)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = data[base + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores reg[0..10] into the 11 consecutive local-memory slots that start
// at dest[11 * tid], then issues a work-group barrier with a local-memory
// fence.
void thread_to_shared_11_float(const float *reg, int tid, local float *dest)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        dest[base + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Bounds-checked load of up to 11 ints from global memory.
// For i in 0..10, reg[i] receives data[i + tid] only when i + tid < count;
// other entries of `reg` are left untouched. No barrier is issued.
void global_to_regstr_pred_1_11_int(int count, global const int *data, int tid, int *reg)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            reg[i] = data[slot];
        }
    }
}
// Loads 11 ints from global memory into reg[0..10].
// When count >= 11 every element data[i + tid] is read without a bounds
// check; otherwise the work is delegated to the bounds-checked variant.
void global_to_regstr_1_11_int(int count, global const int *data, int tid, int *reg)
{
    if (count < 11) {
        // Partial tile: only read slots below `count`.
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i) {
        reg[i] = data[i + tid];
    }
}
// Stores up to 11 ints from `reg` into global memory `dest`.
//   count - exclusive upper bound on the destination index; slots at or
//           beyond it are not written
//   reg   - source array; entries 0..10 are read only when their slot is in range
//   tid   - offset added to every element index
//   dest  - global destination; reg[i] goes to dest[i + tid]
// Finishes with a work-group barrier using a local-memory fence.
void regstr_to_global_1_11_int(int count, const int *reg, int tid, global int *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies the 11 consecutive ints data[tid] .. data[tid + 10] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void shared_to_regstr_1_11_int(local const int *data, int tid, int *reg)
{
    local const int *src = data + tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = src[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots
// dest[tid] .. dest[tid + 10], then issues a work-group barrier with a
// local-memory fence. No bounds checking is performed.
void regstr_to_shared_1_11_int(const int *reg, int tid, local int *dest)
{
    local int *out = dest + tid;
    for (int i = 0; i < 11; ++i) {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves ints from global `source` to local `dest` through an 11-element
// temporary: first a (possibly count-limited) load from global memory,
// then an unconditional store of all 11 temporaries to local memory.
// The temporary is not initialized here, so entries the load skips are
// stored to `dest` with whatever value they hold.
void global_to_shared_1_11_int(int count, global const int *source, int tid, local int *dest)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
// Copies ints from local `source` to global `dest` at the same index:
// for i in 0..10, slot i + tid is copied when it is below `count`.
// Finishes with a work-group barrier using a local-memory fence.
void shared_to_global_1_11_int(int count, local const int *source, int tid, global int *dest)
{
    for (int i = 0; i < 11; ++i) {
        const int slot = i + tid;
        if (slot < count) {
            dest[slot] = source[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Loads the 11 consecutive ints that start at data[11 * tid] from local
// memory into reg[0..10], then issues a work-group barrier with a
// local-memory fence.
void shared_to_thread_11_int(local const int *data, int tid, int *reg)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        reg[i] = data[base + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores reg[0..10] into the 11 consecutive local-memory slots that start
// at dest[11 * tid], then issues a work-group barrier with a local-memory
// fence.
void thread_to_shared_11_int(const int *reg, int tid, local int *dest)
{
    const int base = 11 * tid;
    for (int i = 0; i < 11; ++i) {
        dest[base + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Performs 11 steps of a two-way merge over keys held in local memory.
//   a_begin/a_end - cursor and exclusive end of the first range
//   b_begin/b_end - cursor and exclusive end of the second range
//   indices       - receives, per step, the position the key was taken from
//   keys_shared0  - local-memory key array both ranges index into
//   results0      - receives, per step, the selected key
// At each step the first range is chosen when the second is exhausted, or
// when the first is not exhausted and comp(b_key, a_key) is false (so ties
// go to the first range). After a selection the next key is fetched at the
// advanced cursor unconditionally, so keys_shared0 may be read at or beyond
// a range's end; callers must keep those slots readable.
// Finishes with a work-group barrier using a local-memory fence.
void serial_merge_11_int(int a_begin, int a_end, int b_begin, int b_end, int *indices, local const int *keys_shared0, int *results0)
{
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];

    for (int step = 0; step < 11; ++step) {
        const bool take_a = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0));
        if (take_a) {
            results0[step] = a_key0;
            indices[step] = a_begin;
            a_key0 = keys_shared0[++a_begin];
        } else {
            results0[step] = b_key0;
            indices[step] = b_begin;
            b_key0 = keys_shared0[++b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Binary-searches, along cross-diagonal `diag`, for the split between two
 * key ranges held in local memory.
 *
 * a_count, b_count - number of keys reachable through a0 and b0
 * diag             - combined number of keys to consume from both ranges
 * a0, b0           - the two key ranges
 *
 * Returns how many of the `diag` keys come from a0 (the remainder come from
 * b0). Key ordering is decided by comp(), which is defined elsewhere.
 */
int merge_path_int(int a_count, int b_count, int diag,
                   local const int * a0, local const int * b0)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi) {
        const int mid = (lo + hi) >> 1;
        if (comp(b0[diag - 1 - mid], a0[mid])) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * Loads up to 11 keys from two global ranges that are addressed as a single
 * concatenated sequence: positions below a_count are read from a_global,
 * later positions from b_global.
 *
 * Slot i of reg receives position i + tid. When a_count + b_count is below
 * 11, positions at or beyond that total are skipped and the matching reg
 * slots are left unwritten.
 */
void load2_to_regstr_1_11_11_int(global const int * a_global, int a_count,
                                 global const int * b_global, int b_count,
                                 int tid, int * reg)
{
    const int total = a_count + b_count;
    const int full_tile = total >= 11;

    /* Rebase b so a concatenated position can index it directly. */
    b_global -= a_count;

    for (int slot = 0; slot < 11; ++slot) {
        const int index = slot + tid;
        if (index < a_count) {
            reg[slot] = a_global[index];
        } else if (full_tile || index < total) {
            reg[slot] = b_global[index];
        }
    }
}
/*
 * Copies keys from two global ranges into local memory by staging them in a
 * private 11-element array: load2_to_regstr_1_11_11_int fills the array and
 * regstr_to_shared_1_11_int (defined elsewhere) writes it to `shared`.
 */
void load2_to_shared_1_11_11_int(global const int * a_global, int a_count,
                                 global const int * b_global, int b_count,
                                 int tid, local int * shared)
{
    int staged[11];

    load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, shared);
}
/*
 * Merges this tile's slice of keys and records where each merged key came
 * from.
 *
 * range holds the slice bounds as (a begin, a end, b begin, b end); the
 * incoming a_count / b_count values are overwritten with the slice lengths
 * derived from it. Both slices are loaded back to back into keys_shared0,
 * the per-work-item split is found on diagonal 11 * tid, and
 * serial_merge_11_int then writes 11 merged keys to results0 and their
 * positions within keys_shared0 to indices.
 */
void merge_keys_indices_1_11_int(int a_count, int b_count, int4 range, int tid,
                                 int * indices,
                                 global const int * a_global0,
                                 global const int * b_global0,
                                 local int * keys_shared0,
                                 int * results0)
{
    const int a_first = range.x;
    const int b_first = range.z;

    a_count = range.y - a_first;
    b_count = range.w - b_first;

    load2_to_shared_1_11_11_int(a_global0 + a_first, a_count,
                                b_global0 + b_first, b_count,
                                tid, keys_shared0);

    const int diag = 11 * tid;
    const int split = merge_path_int(a_count, b_count, diag,
                                     keys_shared0, keys_shared0 + a_count);

    /* The b slice sits directly after the a slice inside keys_shared0. */
    serial_merge_11_int(split, a_count,
                        a_count + diag - split, a_count + b_count,
                        indices, keys_shared0, results0);
}
/*
 * Gathers the values that accompany merged keys into a private array.
 *
 * indices[i] is a position in the concatenated (a, b) ordering: positions
 * below b_start are read from a_global0, the others from b_global0.
 * When count is below 11, slot i is only filled if i + tid < count; other
 * slots of reg0 stay unwritten. Ends with a local-memory barrier.
 */
void transfer_merge_values_regstr_1_11_float(int count, int b_start,
                                             const int * indices, int tid,
                                             global const float * a_global0,
                                             global const float * b_global0,
                                             float * reg0)
{
    const int full_tile = count >= 11;

    /* Rebase b so a concatenated position can index it directly. */
    b_global0 -= b_start;

    for (int slot = 0; slot < 11; ++slot) {
        if (full_tile || slot + tid < count) {
            const int src = indices[slot];
            if (src < b_start) {
                reg0[slot] = a_global0[src];
            } else {
                reg0[slot] = b_global0[src];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Moves merged values to their destination: reads the source positions from
 * local memory, gathers the matching values from a_global0 / b_global0 into
 * a private array, then stores that array to dest_global0.
 * The helpers shared_to_regstr_1_11_int and regstr_to_global_1_11_float are
 * defined elsewhere.
 */
void transfer_merge_values_shared_1_11_float(int count, int b_start,
                                             local const int * indices_shared,
                                             int tid,
                                             global const float * a_global0,
                                             global const float * b_global0,
                                             global float * dest_global0)
{
    int src_pos[11];
    float gathered[11];

    shared_to_regstr_1_11_int(indices_shared, tid, src_pos);
    transfer_merge_values_regstr_1_11_float(count, b_start, src_pos, tid,
                                            a_global0, b_global0, gathered);
    regstr_to_global_1_11_float(count, gathered, tid, dest_global0);
}
/*
 * Merges one tile of (int key, float value) pairs.
 *
 * Keys are merged first and written to keys_global0 at offset 11 * block.
 * The source positions produced by the key merge are then staged in
 * indices_shared and used to gather the values into vals_global0 at the
 * same offset. range holds (a begin, a end, b begin, b end) for this tile.
 */
void device_merge_1_11_int_float(int a_count, int b_count,
                                 global const int * a_keys_global0,
                                 global const int * b_keys_global0,
                                 global int * keys_global0,
                                 local int * keys_shared0,
                                 global const float * a_vals_global0,
                                 global const float * b_vals_global0,
                                 global float * vals_global0,
                                 int tid, int block, int4 range,
                                 local int * indices_shared)
{
    int merged_keys[11];
    int merged_src[11];

    merge_keys_indices_1_11_int(a_count, b_count, range, tid, merged_src,
                                a_keys_global0, b_keys_global0,
                                keys_shared0, merged_keys);
    thread_to_shared_11_int(merged_keys, tid, keys_shared0);

    /* From here on the counts describe this tile's slice only. */
    a_count = range.y - range.x;
    b_count = range.w - range.z;
    const int total = a_count + b_count;
    const int out_offset = 11 * block;

    shared_to_global_1_11_int(total, keys_shared0, tid, keys_global0 + out_offset);

    thread_to_shared_11_int(merged_src, tid, indices_shared);
    transfer_merge_values_shared_1_11_float(total, a_count, indices_shared, tid,
                                            a_vals_global0 + range.x,
                                            b_vals_global0 + range.z,
                                            vals_global0 + out_offset);
}
/*
 * Kernel entry point: merges one tile of key/value pairs per work-group.
 *
 * The tile bounds come from compute_merge_range (defined elsewhere) using
 * the partition table mp_global; the merge itself is done by
 * device_merge_1_11_int_float.
 */
kernel void merge(int a_count, int b_count,
                  global const int * a_keys_global0,
                  global const int * b_keys_global0,
                  global int * keys_global0,
                  global const float * a_vals_global0,
                  global const float * b_vals_global0,
                  global float * vals_global0,
                  global const int * mp_global,
                  int coop)
{
    /* Key staging and index staging overlay the same local storage. */
    union Shared {
        struct {
            int keys0[12];
        };
        int indices[11];
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global);

    device_merge_1_11_int_float(a_count, b_count,
                                a_keys_global0, b_keys_global0, keys_global0,
                                shared.keys0,
                                a_vals_global0, b_vals_global0, vals_global0,
                                tid, block, range, shared.indices);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Strict ordering used by the merge: values are ranked by their lowest bit
 * first (bit 0 clear before bit 0 set), and by ordinary `<` when the lowest
 * bits match.
 *
 * Returns true when a orders before b.
 */
bool comp(int a, int b)
{
    const int low_a = a & 1;
    const int low_b = b & 1;

    return (low_a != low_b) ? (low_a < low_b) : (a < b);
}
/*
 * Binary-searches, along cross-diagonal `diag`, for the split between two
 * key ranges held in global memory.
 *
 * a_count, b_count - number of keys reachable through a0 and b0
 * diag             - combined number of keys to consume from both ranges
 * a0, b0           - the two key ranges
 *
 * Returns how many of the `diag` keys come from a0 (the remainder come from
 * b0). Key ordering is decided by comp().
 */
int merge_path_int
(
int a_count,
int b_count,
int diag,
global const int * a0,
global const int * b0
)
{
    int begin = max(0, diag - b_count);
    int end = min(diag, a_count);
    while (begin < end)
    {
        /*
         * Offset from begin instead of averaging begin + end: the sum can
         * exceed INT_MAX once a range holds more than 2^30 keys, whereas
         * end - begin always fits.
         */
        int mid = begin + ((end - begin) >> 1);
        if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1;
        else end = mid;
    }
    return begin;
}
/*
 * Describes the pair of adjacent lists that `block` helps to merge.
 *
 * Returns an int4 whose x is the start of the first list, y the start of
 * the second list and z the length of one list, all in elements
 * (nv elements per block, coop blocks per merged pair). The w component is
 * not set.
 */
int4 find_mergesort_frame(int coop, int block, int nv)
{
    const int first_block = block & ~(coop - 1);
    const int list_len = nv * (coop >> 1);
    const int list_start = nv * first_block;

    int4 frame;
    frame.x = list_start;
    frame.y = list_start + list_len;
    frame.z = list_len;
    return frame;
}
/*
 * Kernel entry point: computes one merge-path split per work-item and
 * stores it in mp_global[partition].
 *
 * With coop == 0 the search runs over the full a/b ranges on diagonal
 * nv * partition. With coop != 0 both ranges are sub-ranges of the same
 * array of a_count elements, selected by find_mergesort_frame, and the
 * diagonal is taken relative to the frame start.
 * Work-items with partition >= num_searches do nothing.
 */
kernel void merge_partition(int a_count, int b_count, int nv, int coop,
                            global int * mp_global, int num_searches,
                            global const int * a_global0,
                            global const int * b_global0)
{
    const int partition = get_global_id(0);
    if (partition >= num_searches) {
        return;
    }

    int a0 = 0;
    int b0 = 0;
    int gid = nv * partition;

    if (coop) {
        const int4 frame = find_mergesort_frame(coop, partition, nv);
        a0 = frame.x;
        b0 = min(a_count, frame.y);
        /* b_count must be derived before a_count is overwritten below. */
        b_count = min(a_count, frame.y + frame.z) - b0;
        a_count = min(a_count, frame.x + frame.z) - a0;
        gid -= a0;
    }

    const int diag = min(gid, a_count + b_count);
    mp_global[partition] = merge_path_int(a_count, b_count, diag,
                                          a_global0 + a0, b_global0 + b0);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Lexicographic "less than" on (int, float) pairs: the int parts decide
 * unless they are equal, in which case the float parts are compared.
 *
 * Returns true when (a1, a2) orders before (b1, b2).
 */
bool comp(int a1, float a2, int b1, float b2)
{
    if (a1 != b1) {
        return a1 < b1;
    }
    return a2 < b2;
}
/*
 * Bounds-checked load of up to 7 floats from global memory: slot i of reg
 * receives data[i + tid] when i + tid < count, and is left unwritten
 * otherwise.
 */
void global_to_regstr_pred_1_7_float(int count, global const float * data,
                                     int tid, float * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            reg[slot] = data[index];
        }
    }
}
/*
 * Loads 7 floats from global memory into reg, slot i taking data[i + tid].
 * When count is below 7 the bounds-checked variant is used instead, so
 * positions at or past count are not read.
 */
void global_to_regstr_1_7_float(int count, global const float * data,
                                int tid, float * reg)
{
    if (count < 7) {
        global_to_regstr_pred_1_7_float(count, data, tid, reg);
        return;
    }

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Stores up to 7 floats from reg to global memory: reg[i] goes to
 * dest[i + tid] when i + tid < count. Ends with a local-memory barrier.
 */
void regstr_to_global_1_7_float(int count, const float * reg,
                                int tid, global float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = reg[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads 7 floats from local memory into reg, slot i taking data[i + tid],
 * then issues a local-memory barrier.
 */
void shared_to_regstr_1_7_float(local const float * data, int tid, float * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes 7 floats from reg to local memory, reg[i] going to dest[i + tid],
 * then issues a local-memory barrier.
 */
void regstr_to_shared_1_7_float(const float * reg, int tid, local float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 7 floats from global to local memory through a private
 * staging array. The load honours `count`; the store to `dest` writes all
 * 7 staged slots.
 */
void global_to_shared_1_7_float(int count, global const float * source,
                                int tid, local float * dest)
{
    float staged[7];

    global_to_regstr_1_7_float(count, source, tid, staged);
    regstr_to_shared_1_7_float(staged, tid, dest);
}
/*
 * Copies up to 7 floats from local to global memory: position i + tid is
 * copied when it is below count. Ends with a local-memory barrier.
 */
void shared_to_global_1_7_float(int count, local const float * source,
                                int tid, global float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = source[index];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads the 7 consecutive floats starting at data[7 * tid] into reg, then
 * issues a local-memory barrier.
 */
void shared_to_thread_7_float(local const float * data, int tid, float * reg)
{
    local const float * mine = data + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = mine[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes reg[0..6] to the 7 consecutive floats starting at dest[7 * tid],
 * then issues a local-memory barrier.
 */
void thread_to_shared_7_float(const float * reg, int tid, local float * dest)
{
    local float * mine = dest + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        mine[slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to 7 ints from global memory: slot i of reg
 * receives data[i + tid] when i + tid < count, and is left unwritten
 * otherwise.
 */
void global_to_regstr_pred_1_7_int(int count, global const int * data,
                                   int tid, int * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            reg[slot] = data[index];
        }
    }
}
/*
 * Loads 7 ints from global memory into reg, slot i taking data[i + tid].
 * When count is below 7 the bounds-checked variant is used instead, so
 * positions at or past count are not read.
 */
void global_to_regstr_1_7_int(int count, global const int * data,
                              int tid, int * reg)
{
    if (count < 7) {
        global_to_regstr_pred_1_7_int(count, data, tid, reg);
        return;
    }

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Stores up to 7 ints from reg to global memory: reg[i] goes to
 * dest[i + tid] when i + tid < count. Ends with a local-memory barrier.
 */
void regstr_to_global_1_7_int(int count, const int * reg,
                              int tid, global int * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = reg[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads 7 ints from local memory into reg, slot i taking data[i + tid],
 * then issues a local-memory barrier.
 */
void shared_to_regstr_1_7_int(local const int * data, int tid, int * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes 7 ints from reg to local memory, reg[i] going to dest[i + tid],
 * then issues a local-memory barrier.
 */
void regstr_to_shared_1_7_int(const int * reg, int tid, local int * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 7 ints from global to local memory through a private
 * staging array. The load honours `count`; the store to `dest` writes all
 * 7 staged slots.
 */
void global_to_shared_1_7_int(int count, global const int * source,
                              int tid, local int * dest)
{
    int staged[7];

    global_to_regstr_1_7_int(count, source, tid, staged);
    regstr_to_shared_1_7_int(staged, tid, dest);
}
/*
 * Copies up to 7 ints from local to global memory: position i + tid is
 * copied when it is below count. Ends with a local-memory barrier.
 */
void shared_to_global_1_7_int(int count, local const int * source,
                              int tid, global int * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = source[index];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads the 7 consecutive ints starting at data[7 * tid] into reg, then
 * issues a local-memory barrier.
 */
void shared_to_thread_7_int(local const int * data, int tid, int * reg)
{
    local const int * mine = data + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = mine[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes reg[0..6] to the 7 consecutive ints starting at dest[7 * tid],
 * then issues a local-memory barrier.
 */
void thread_to_shared_7_int(const int * reg, int tid, local int * dest)
{
    local int * mine = dest + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        mine[slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Sequentially merges 7 (int, float) pairs from two ranges of the local
 * arrays keys_shared0 / keys_shared1.
 *
 * [a_begin, a_end) and [b_begin, b_end) delimit the two ranges. For each of
 * the 7 outputs the pair is written to results0 / results1 and its position
 * in the local arrays to indices. A is chosen when B is exhausted, or when
 * A still has pairs and B's head does not order before A's head according
 * to comp(). Ends with a local-memory barrier.
 */
void serial_merge_7_int_float(int a_begin, int a_end, int b_begin, int b_end,
                              int * indices,
                              local const int * keys_shared0,
                              local const float * keys_shared1,
                              int * results0, float * results1)
{
    int a_key0 = keys_shared0[a_begin];
    float a_key1 = keys_shared1[a_begin];
    int b_key0 = keys_shared0[b_begin];
    float b_key1 = keys_shared1[b_begin];

    for (int out = 0; out < 7; ++out) {
        const bool take_a = (b_begin >= b_end) ||
            ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1));

        if (take_a) {
            results0[out] = a_key0;
            results1[out] = a_key1;
            indices[out] = a_begin;
            /* Advance A and pre-load its next head. */
            ++a_begin;
            a_key0 = keys_shared0[a_begin];
            a_key1 = keys_shared1[a_begin];
        } else {
            results0[out] = b_key0;
            results1[out] = b_key1;
            indices[out] = b_begin;
            /* Advance B and pre-load its next head. */
            ++b_begin;
            b_key0 = keys_shared0[b_begin];
            b_key1 = keys_shared1[b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Exchanges two (int, float) pairs in place: *a0 with *b0 and *a1 with *b1.
 */
void swap_int_float(int * a0, float * a1, int * b0, float * b1)
{
    const int held_int = *b0;
    const float held_float = *b1;

    *b0 = *a0;
    *b1 = *a1;
    *a0 = held_int;
    *a1 = held_float;
}
/*
 * Sorts 7 (int, float) pairs in place with an odd-even transposition
 * network: 7 rounds that alternate between comparing neighbours
 * (0,1)(2,3)(4,5) and (1,2)(3,4)(5,6), swapping a neighbour pair whenever
 * comp() says the right element orders before the left one.
 */
void odd_even_transpose_sort_7_int_float(int * keys0, float * keys1)
{
    for (int round = 0; round < 7; ++round) {
        /* Even rounds start at element 0, odd rounds at element 1. */
        for (int left = round & 1; left + 1 < 7; left += 2) {
            const int right = left + 1;
            if (comp(keys0[right], keys1[right], keys0[left], keys1[left])) {
                swap_int_float(keys0 + left, keys1 + left,
                               keys0 + right, keys1 + right);
            }
        }
    }
}
/*
 * Binary-searches, along cross-diagonal `diag`, for the split between two
 * ranges of (int, float) pairs held in local memory.
 *
 * a_count, b_count - number of pairs reachable through (a0, a1) and (b0, b1)
 * diag             - combined number of pairs to consume from both ranges
 *
 * Returns how many of the `diag` pairs come from the a range. Ordering is
 * decided by the four-argument comp().
 */
int merge_path_int_float(int a_count, int b_count, int diag,
                         local const int * a0, local const float * a1,
                         local const int * b0, local const float * b1)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi) {
        const int mid = (lo + hi) >> 1;
        const int b_pos = diag - 1 - mid;
        if (comp(b0[b_pos], b1[b_pos], a0[mid], a1[mid])) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * One merge pass over lists stored in local memory.
 *
 * Work-items are grouped in sets of `coop`; each set merges two adjacent
 * lists of 7 * (coop / 2) pairs, clamped to `count`. This work-item finds
 * its split on its own diagonal and serially merges 7 pairs into
 * keys0 / keys1, recording source positions in indices.
 */
void block_sort_pass_1_7_int_float(int tid, int count, int coop,
                                   int * indices,
                                   local const int * keys_shared0,
                                   local const float * keys_shared1,
                                   int * keys0, float * keys1)
{
    const int lane_mask = coop - 1;
    const int first_lane = tid & ~lane_mask;
    const int diag = min(count, 7 * (tid & lane_mask));
    const int base = 7 * first_lane;

    const int a_lo = min(count, base);
    const int b_lo = min(count, base + 7 * (coop / 2));
    const int b_hi = min(count, base + 7 * coop);

    const int split = merge_path_int_float(b_lo - a_lo, b_hi - b_lo, diag,
                                           keys_shared0 + a_lo, keys_shared1 + a_lo,
                                           keys_shared0 + b_lo, keys_shared1 + b_lo);

    serial_merge_7_int_float(a_lo + split, b_lo, b_lo + diag - split, b_hi,
                             indices, keys_shared0, keys_shared1, keys0, keys1);
}
/*
 * Placeholder for the sequence of merge passes over the local key arrays.
 *
 * In this generated variant the function performs no work and touches none
 * of its arguments; the three scratch arrays it used to declare were never
 * read or written and have been dropped.
 * NOTE(review): presumably the generator emits zero passes for this
 * configuration - confirm against the code generator.
 */
void block_sort_loop_1_7_int_float
(
int tid,
int count,
local int * keys_shared0,
local float * keys_shared1
)
{
}
/*
 * Sorts the pairs of one work-group.
 *
 * Each work-item whose first element (7 * tid) lies below `count` sorts its
 * own 7 pairs in private memory; every work-item then publishes its pairs
 * to the local arrays and calls block_sort_loop_1_7_int_float.
 */
void mergesort_1_7_int_float(int count, int tid,
                             int * thread_keys0, float * thread_keys1,
                             local int * keys_shared0,
                             local float * keys_shared1)
{
    const int has_keys = 7 * tid < count;
    if (has_keys) {
        odd_even_transpose_sort_7_int_float(thread_keys0, thread_keys1);
    }

    thread_to_shared_7_int(thread_keys0, tid, keys_shared0);
    thread_to_shared_7_float(thread_keys1, tid, keys_shared1);

    block_sort_loop_1_7_int_float(tid, count, keys_shared0, keys_shared1);
}
/*
 * Kernel entry point: sorts one tile of 7 (int, float) pairs per
 * work-group, reading from keys_src0 / keys_src1 and writing to
 * keys_dst0 / keys_dst1 at offset 7 * group id.
 *
 * A work-item whose 7 slots are only partly covered by the tile pads the
 * uncovered slots with the largest pair it holds (per comp()) before the
 * sort. Only the first count2 positions are written back.
 */
kernel void block_sort(int count,
                       global const int * keys_src0,
                       global const float * keys_src1,
                       global int * keys_dst0,
                       global float * keys_dst1)
{
    union Shared {
        struct {
            int keys0[8];
            float keys1[8];
        };
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int gid = 7 * block;
    const int count2 = min(7, count - gid);

    int thread_keys0[7];
    float thread_keys1[7];

    global_to_shared_1_7_int(count2, keys_src0 + gid, tid, shared.keys0);
    global_to_shared_1_7_float(count2, keys_src1 + gid, tid, shared.keys1);
    shared_to_thread_7_int(shared.keys0, tid, thread_keys0);
    shared_to_thread_7_float(shared.keys1, tid, thread_keys1);

    const int first = 7 * tid;
    if (first < count2 && first + 7 > count2) {
        /* Find the largest valid pair held by this work-item. */
        int max_key0 = thread_keys0[0];
        float max_key1 = thread_keys1[0];
        for (int i = 1; i < 7; ++i) {
            if (first + i < count2 &&
                comp(max_key0, max_key1, thread_keys0[i], thread_keys1[i])) {
                max_key0 = thread_keys0[i];
                max_key1 = thread_keys1[i];
            }
        }

        /* Overwrite every slot past the end of the tile with that pair. */
        for (int i = 0; i < 7; ++i) {
            if (first + i >= count2) {
                thread_keys0[i] = max_key0;
                thread_keys1[i] = max_key1;
            }
        }
    }

    mergesort_1_7_int_float(count2, tid, thread_keys0, thread_keys1,
                            shared.keys0, shared.keys1);

    shared_to_global_1_7_int(count2, shared.keys0, tid, keys_dst0 + gid);
    shared_to_global_1_7_float(count2, shared.keys1, tid, keys_dst1 + gid);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Lexicographic "less than" on (int, float) pairs: the int parts decide
 * unless they are equal, in which case the float parts are compared.
 *
 * Returns true when (a1, a2) orders before (b1, b2).
 */
bool comp(int a1, float a2, int b1, float b2)
{
    if (a1 != b1) {
        return a1 < b1;
    }
    return a2 < b2;
}
/*
 * Describes the pair of adjacent lists that `block` helps to merge.
 *
 * Returns an int4 whose x is the start of the first list, y the start of
 * the second list and z the length of one list, all in elements
 * (nv elements per block, coop blocks per merged pair). The w component is
 * not set.
 */
int4 find_mergesort_frame(int coop, int block, int nv)
{
    const int first_block = block & ~(coop - 1);
    const int list_len = nv * (coop >> 1);
    const int list_start = nv * first_block;

    int4 frame;
    frame.x = list_start;
    frame.y = list_start + list_len;
    frame.z = list_len;
    return frame;
}
/*
 * Turns a frame plus two consecutive merge-path splits into the element
 * intervals one block has to merge.
 *
 * Returns (a begin, a end, b begin, b end), each end clamped to `count`.
 * For the last block of a cooperating group (all low bits of `block` under
 * coop - 1 set) the ends are taken from the frame itself instead of mp1.
 */
int4 find_mergesort_interval(int4 frame, int coop, int block, int nv,
                             int count, int mp0, int mp1)
{
    const int diag = nv * block - frame.x;
    const int lane_mask = coop - 1;
    const int last_in_group = lane_mask == (lane_mask & block);

    int4 interval;
    interval.x = frame.x + mp0;
    interval.z = min(count, frame.y + diag - mp0);

    if (last_in_group) {
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    } else {
        interval.y = min(count, frame.x + mp1);
        interval.w = min(count, frame.y + diag + nv - mp1);
    }
    return interval;
}
/*
 * Computes the (a begin, a end, b begin, b end) range that `block` merges,
 * from the splits stored at mp_global[block] and mp_global[block + 1].
 *
 * With coop != 0 the range is derived from the mergesort frame of the
 * block, using a_count as the total element count. With coop == 0 the a
 * range is [mp0, mp1) and the b range is whatever remains of the block's
 * nv-element tile, clamped to a_count + b_count.
 */
int4 compute_merge_range(int a_count, int b_count, int block, int coop,
                         int nv, global const int * mp_global)
{
    const int mp0 = mp_global[block];
    const int mp1 = mp_global[block + 1];

    if (coop) {
        const int4 frame = find_mergesort_frame(coop, block, nv);
        return find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }

    const int gid = nv * block;
    int4 range;
    range.x = mp0;
    range.y = mp1;
    range.z = gid - mp0;
    range.w = min(a_count + b_count, gid + nv) - mp1;
    return range;
}
/*
 * Bounds-checked load of up to 7 floats from global memory: slot i of reg
 * receives data[i + tid] when i + tid < count, and is left unwritten
 * otherwise.
 */
void global_to_regstr_pred_1_7_float(int count, global const float * data,
                                     int tid, float * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            reg[slot] = data[index];
        }
    }
}
/*
 * Loads 7 floats from global memory into reg, slot i taking data[i + tid].
 * When count is below 7 the bounds-checked variant is used instead, so
 * positions at or past count are not read.
 */
void global_to_regstr_1_7_float(int count, global const float * data,
                                int tid, float * reg)
{
    if (count < 7) {
        global_to_regstr_pred_1_7_float(count, data, tid, reg);
        return;
    }

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Stores up to 7 floats from reg to global memory: reg[i] goes to
 * dest[i + tid] when i + tid < count. Ends with a local-memory barrier.
 */
void regstr_to_global_1_7_float(int count, const float * reg,
                                int tid, global float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = reg[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads 7 floats from local memory into reg, slot i taking data[i + tid],
 * then issues a local-memory barrier.
 */
void shared_to_regstr_1_7_float(local const float * data, int tid, float * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = data[slot + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes 7 floats from reg to local memory, reg[i] going to dest[i + tid],
 * then issues a local-memory barrier.
 */
void regstr_to_shared_1_7_float(const float * reg, int tid, local float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        dest[slot + tid] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies up to 7 floats from global to local memory through a private
 * staging array. The load honours `count`; the store to `dest` writes all
 * 7 staged slots.
 */
void global_to_shared_1_7_float(int count, global const float * source,
                                int tid, local float * dest)
{
    float staged[7];

    global_to_regstr_1_7_float(count, source, tid, staged);
    regstr_to_shared_1_7_float(staged, tid, dest);
}
/*
 * Copies up to 7 floats from local to global memory: position i + tid is
 * copied when it is below count. Ends with a local-memory barrier.
 */
void shared_to_global_1_7_float(int count, local const float * source,
                                int tid, global float * dest)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            dest[index] = source[index];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads the 7 consecutive floats starting at data[7 * tid] into reg, then
 * issues a local-memory barrier.
 */
void shared_to_thread_7_float(local const float * data, int tid, float * reg)
{
    local const float * mine = data + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        reg[slot] = mine[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes reg[0..6] to the 7 consecutive floats starting at dest[7 * tid],
 * then issues a local-memory barrier.
 */
void thread_to_shared_7_float(const float * reg, int tid, local float * dest)
{
    local float * mine = dest + 7 * tid;

    for (int slot = 0; slot < 7; ++slot) {
        mine[slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to 7 ints from global memory: slot i of reg
 * receives data[i + tid] when i + tid < count, and is left unwritten
 * otherwise.
 */
void global_to_regstr_pred_1_7_int(int count, global const int * data,
                                   int tid, int * reg)
{
    for (int slot = 0; slot < 7; ++slot) {
        const int index = slot + tid;
        if (index < count) {
            reg[slot] = data[index];
        }
    }
}
/*
 * Load seven ints data[tid + 0..6] from global memory into reg[0..6].
 * When count >= 7 the loads are unconditional (no per-element check);
 * otherwise the work is handed to global_to_regstr_pred_1_7_int, which
 * checks every position against `count`.
 */
void global_to_regstr_1_7_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 7)
    {
        global_to_regstr_pred_1_7_int(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 7; ++i)
    {
        reg[i] = data[i + tid];
    }
}
/*
 * Bounds-checked store of reg[0..6] to global memory.
 * For i in 0..6, dest[tid + i] = reg[i] only when tid + i < count.
 * Ends with a work-group barrier (local-memory fence).
 */
void regstr_to_global_1_7_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 7; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the seven ints data[tid + 0..6] from local memory into reg[0..6],
 * then wait at a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_7_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * in = data + tid;
    for (int i = 0; i < 7; ++i)
    {
        reg[i] = in[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Store the seven values reg[0..6] into local memory at dest[tid + 0..6],
 * then wait at a work-group barrier (local-memory fence).
 */
void regstr_to_shared_1_7_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + tid;
    for (int i = 0; i < 7; ++i)
    {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Move seven ints from global memory into local memory via a private
 * temporary. global_to_regstr_1_7_int fills the temporary from `source`
 * and regstr_to_shared_1_7_int writes all seven slots to dest[tid + 0..6].
 * NOTE(review): when count < 7 the loader skips positions >= count, so the
 * matching temporary slots are never assigned before being stored.
 */
void global_to_shared_1_7_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    int staged[7];

    global_to_regstr_1_7_int(count, source, tid, staged);
    regstr_to_shared_1_7_int(staged, tid, dest);
}
/*
 * Copy ints from local memory to global memory at matching positions.
 * For i in 0..6 the element at position tid + i is copied only when that
 * position is below `count`. Ends with a work-group barrier
 * (local-memory fence).
 */
void shared_to_global_1_7_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 7; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the seven consecutive ints data[7 * tid + 0..6] from local memory
 * into reg[0..6], then wait at a work-group barrier (local-memory fence).
 */
void shared_to_thread_7_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * chunk = data + 7 * tid;
    for (int i = 0; i < 7; ++i)
    {
        reg[i] = chunk[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Write reg[0..6] to the seven consecutive local-memory slots
 * dest[7 * tid + 0..6], then wait at a work-group barrier
 * (local-memory fence).
 */
void thread_to_shared_7_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * chunk = dest + 7 * tid;
    for (int i = 0; i < 7; ++i)
    {
        chunk[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Sequentially merge seven (int, float) keys from two ranges of the
 * local-memory key arrays: A = [a_begin, a_end) and B = [b_begin, b_end).
 *
 * Each step takes the head of A when B is exhausted, or when A is not
 * exhausted and comp(b_head, a_head) is false; otherwise it takes the head
 * of B. The chosen key goes to results0[i] / results1[i] and the position
 * it was read from goes to indices[i].
 *
 * After every pick the next key of the advanced range is fetched without a
 * bounds check, so the key arrays must be readable one element past the
 * last position consumed. Ends with a work-group barrier
 * (local-memory fence).
 */
void serial_merge_7_int_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const int * keys_shared0,
    local const float * keys_shared1,
    int * results0,
    float * results1
)
{
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];
    float a_key1 = keys_shared1[a_begin];
    float b_key1 = keys_shared1[b_begin];

    for (int i = 0; i < 7; ++i)
    {
        const bool take_a = (b_begin >= b_end)
            || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1));

        if (take_a)
        {
            results0[i] = a_key0;
            results1[i] = a_key1;
            indices[i] = a_begin;

            ++a_begin;
            a_key0 = keys_shared0[a_begin];
            a_key1 = keys_shared1[a_begin];
        }
        else
        {
            results0[i] = b_key0;
            results1[i] = b_key1;
            indices[i] = b_begin;

            ++b_begin;
            b_key0 = keys_shared0[b_begin];
            b_key1 = keys_shared1[b_begin];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Binary search for a split point on diagonal `diag` between the key
 * sequences (a0, a1) and (b0, b1) held in local memory.
 *
 * The search interval is [max(0, diag - b_count), min(diag, a_count)].
 * At each probe `mid`, if comp(b[diag - 1 - mid], a[mid]) holds the upper
 * bound drops to mid, otherwise the lower bound rises to mid + 1.
 * Returns the final lower bound.
 */
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    local const int * a0,
    local const float * a1,
    local const int * b0,
    local const float * b1
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid = (lo + hi) >> 1;
        const int b_pos = diag - 1 - mid;

        if (comp(b0[b_pos], b1[b_pos], a0[mid], a1[mid]))
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * Load seven floats from the concatenation of two global arrays.
 *
 * Position p = tid + i (i in 0..6) reads a_global[p] when p < a_count and
 * b_global[p - a_count] otherwise (done by rebasing the B pointer by
 * -a_count). When a_count + b_count >= 7 the B load is unconditional; for
 * smaller totals it happens only when p < a_count + b_count and the other
 * reg slots are left untouched.
 */
void load2_to_regstr_1_7_7_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    float * reg
)
{
    /* Rebase B so both inputs can be indexed by the same position. */
    global const float * b_rebased = b_global - a_count;
    const int total = a_count + b_count;
    const bool unchecked = (total >= 7);

    for (int i = 0; i < 7; ++i)
    {
        const int pos = i + tid;
        if (pos < a_count)
        {
            reg[i] = a_global[pos];
        }
        else if (unchecked || pos < total)
        {
            reg[i] = b_rebased[pos];
        }
    }
}
/*
 * Load seven floats from the pair of global arrays (A followed by B) into
 * a private temporary with load2_to_regstr_1_7_7_float, then store the
 * temporary to local memory at shared[tid + 0..6] with
 * regstr_to_shared_1_7_float (which ends in a work-group barrier).
 */
void load2_to_shared_1_7_7_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    local float * shared
)
{
    float staged[7];

    load2_to_regstr_1_7_7_float(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_7_float(staged, tid, shared);
}
/*
 * Load seven ints from the concatenation of two global arrays.
 *
 * Position p = tid + i (i in 0..6) reads a_global[p] when p < a_count and
 * b_global[p - a_count] otherwise (done by rebasing the B pointer by
 * -a_count). When a_count + b_count >= 7 the B load is unconditional; for
 * smaller totals it happens only when p < a_count + b_count and the other
 * reg slots are left untouched.
 */
void load2_to_regstr_1_7_7_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    int * reg
)
{
    /* Rebase B so both inputs can be indexed by the same position. */
    global const int * b_rebased = b_global - a_count;
    const int total = a_count + b_count;

    if (total >= 7)
    {
        /* Enough elements overall: no check against `total`. */
        for (int i = 0; i < 7; ++i)
        {
            const int pos = i + tid;
            reg[i] = (pos < a_count) ? a_global[pos] : b_rebased[pos];
        }
    }
    else
    {
        for (int i = 0; i < 7; ++i)
        {
            const int pos = i + tid;
            if (pos < a_count)
            {
                reg[i] = a_global[pos];
            }
            else if (pos < total)
            {
                reg[i] = b_rebased[pos];
            }
        }
    }
}
/*
 * Load seven ints from the pair of global arrays (A followed by B) into a
 * private temporary with load2_to_regstr_1_7_7_int, then store the
 * temporary to local memory at shared[tid + 0..6] with
 * regstr_to_shared_1_7_int (which ends in a work-group barrier).
 */
void load2_to_shared_1_7_7_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    local int * shared
)
{
    int staged[7];

    load2_to_regstr_1_7_7_int(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_7_int(staged, tid, shared);
}
/*
 * Load one tile of A and B keys into local memory and merge seven of them.
 *
 * `range` holds the tile bounds: A = [range.x, range.y), B = [range.z,
 * range.w). The incoming a_count / b_count values are ignored and replaced
 * by the tile sizes derived from `range`.
 *
 * Steps: (1) load both key components into keys_shared0 / keys_shared1 with
 * A first and B directly after it; (2) find this work-item's split on
 * diagonal 7 * tid with merge_path_int_float; (3) run
 * serial_merge_7_int_float from that split, producing results0 / results1
 * and the source positions in `indices`.
 */
void merge_keys_indices_1_7_int_float
(
    int a_count,
    int b_count,
    int4 range,
    int tid,
    int * indices,
    global const int * a_global0,
    global const float * a_global1,
    global const int * b_global0,
    global const float * b_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    int * results0,
    float * results1
)
{
    const int a_first = range.x;
    const int b_first = range.z;
    a_count = range.y - a_first;
    b_count = range.w - b_first;

    load2_to_shared_1_7_7_int(a_global0 + a_first, a_count, b_global0 + b_first, b_count, tid, keys_shared0);
    load2_to_shared_1_7_7_float(a_global1 + a_first, a_count, b_global1 + b_first, b_count, tid, keys_shared1);

    const int diag = 7 * tid;
    const int split = merge_path_int_float(a_count, b_count, diag,
                                           keys_shared0, keys_shared1,
                                           keys_shared0 + a_count, keys_shared1 + a_count);

    /* B lives right after A in local memory, hence the a_count offsets. */
    serial_merge_7_int_float(split, a_count,
                             a_count + diag - split, a_count + b_count,
                             indices, keys_shared0, keys_shared1, results0, results1);
}
/*
 * Merge one tile of (int, float) keys and write it to global memory.
 *
 * merge_keys_indices_1_7_int_float produces seven merged keys per
 * work-item; they are written back to local memory at 7 * tid + 0..6 and
 * then copied to keys_global0 / keys_global1 starting at offset 7 * block,
 * limited to the tile's element count (range.y - range.x) +
 * (range.w - range.z).
 *
 * `indices_shared` and the per-key source positions returned by the merge
 * are not used by this function.
 */
void device_merge_1_7_int_float
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    int tid,
    int block,
    int4 range,
    local int * indices_shared
)
{
    int merged0[7];
    float merged1[7];
    int source_pos[7];

    merge_keys_indices_1_7_int_float(a_count, b_count, range, tid, source_pos,
                                     a_keys_global0, a_keys_global1,
                                     b_keys_global0, b_keys_global1,
                                     keys_shared0, keys_shared1,
                                     merged0, merged1);

    thread_to_shared_7_int(merged0, tid, keys_shared0);
    thread_to_shared_7_float(merged1, tid, keys_shared1);

    const int tile_total = (range.y - range.x) + (range.w - range.z);
    const int out_offset = 7 * block;

    shared_to_global_1_7_int(tile_total, keys_shared0, tid, keys_global0 + out_offset);
    shared_to_global_1_7_float(tile_total, keys_shared1, tid, keys_global1 + out_offset);
}
/*
 * Kernel entry point for merging (int, float) keys.
 *
 * Uses the work-group id as the tile index and the local id as `tid`,
 * obtains the tile's input ranges from compute_merge_range (defined
 * outside this block; called with a tile size of 7 and the partition
 * table mp_global), and delegates to device_merge_1_7_int_float.
 */
kernel void merge
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    global const int * mp_global,
    int coop
)
{
    /* Key staging buffers and the index buffer overlay the same storage. */
    union Shared
    {
        struct
        {
            int keys0[8];
            float keys1[8];
        };
        int indices[7];
    };
    local union Shared tile;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int4 range = compute_merge_range(a_count, b_count, block, coop, 7, mp_global);

    device_merge_1_7_int_float(a_count, b_count,
                               a_keys_global0, a_keys_global1,
                               b_keys_global0, b_keys_global1,
                               keys_global0, keys_global1,
                               tile.keys0, tile.keys1,
                               tid, block, range, tile.indices);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Lexicographic "less than" on (int, float) pairs.
 * Returns true when (a1, a2) orders before (b1, b2): the int parts decide
 * unless they are equal, in which case the float parts are compared.
 */
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    if (a1 != b1)
    {
        return a1 < b1;
    }
    return a2 < b2;
}
/*
 * Binary search for a split point on diagonal `diag` between the key
 * sequences (a0, a1) and (b0, b1) held in global memory.
 *
 * The search interval is [max(0, diag - b_count), min(diag, a_count)].
 * At each probe `mid`, if comp(b[diag - 1 - mid], a[mid]) is false the
 * lower bound rises to mid + 1, otherwise the upper bound drops to mid.
 * Returns the final lower bound.
 */
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    global const int * a0,
    global const float * a1,
    global const int * b0,
    global const float * b1
)
{
    int begin = max(0, diag - b_count);
    int end = min(diag, a_count);

    while (begin < end)
    {
        /*
         * Midpoint computed from the interval width: `begin + end` can
         * exceed INT_MAX for large inputs, while `end - begin` cannot
         * because 0 <= begin < end here.
         */
        const int mid = begin + ((end - begin) >> 1);
        const int b_pos = diag - 1 - mid;

        if (!comp(b0[b_pos], b1[b_pos], a0[mid], a1[mid]))
        {
            begin = mid + 1;
        }
        else
        {
            end = mid;
        }
    }
    return begin;
}
/*
 * Compute the frame used for one block of a merge pass.
 *
 * start = block & ~(coop - 1)  (clears the low bits of `block` when coop
 *                               is a power of two)
 * size  = nv * (coop >> 1)
 *
 * Returns an int4 with
 *   x = nv * start
 *   y = nv * start + size
 *   z = size
 *   w = 0 (unused; set explicitly so the returned vector has no
 *          uninitialized component)
 */
int4 find_mergesort_frame
(
    int coop,
    int block,
    int nv
)
{
    const int start = ~(coop - 1) & block;
    const int size = nv * (coop >> 1);

    return (int4)(nv * start, nv * start + size, size, 0);
}
/*
 * Kernel: compute one merge-path partition point per work-item.
 *
 * Work-item `partition` (its global id) does nothing when
 * partition >= num_searches. Otherwise it searches diagonal nv * partition.
 * When `coop` is non-zero the inputs are first restricted to the frame
 * returned by find_mergesort_frame: the A and B sub-ranges are clamped to
 * a_count (B taken from b_global at the frame's second offset) and the
 * diagonal is made relative to the start of the A sub-range.
 * The result of merge_path_int_float is stored in mp_global[partition].
 */
kernel void merge_partition
(
    int a_count,
    int b_count,
    int nv,
    int coop,
    global int * mp_global,
    int num_searches,
    global const int * a_global0,
    global const float * a_global1,
    global const int * b_global0,
    global const float * b_global1
)
{
    const int partition = get_global_id(0);
    if (partition >= num_searches)
    {
        return;
    }

    int a_offset = 0;
    int b_offset = 0;
    int diag = nv * partition;

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, partition, nv);
        a_offset = frame.x;
        b_offset = min(a_count, frame.y);
        /* b_count must be derived before a_count is overwritten. */
        b_count = min(a_count, frame.y + frame.z) - b_offset;
        a_count = min(a_count, frame.x + frame.z) - a_offset;
        diag -= a_offset;
    }

    mp_global[partition] = merge_path_int_float(a_count, b_count,
                                                min(diag, a_count + b_count),
                                                a_global0 + a_offset, a_global1 + a_offset,
                                                b_global0 + b_offset, b_global1 + b_offset);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Ordering predicate for (int, float) key pairs.
 * The int components are compared first; the float components break ties.
 * Returns true only when (a1, a2) is strictly before (b1, b2).
 */
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    const bool same_major = (a1 == b1);
    if (same_major)
    {
        return a2 < b2;
    }
    return a1 < b1;
}
/*
 * Bounds-checked load of up to eleven shorts from global memory.
 * For i in 0..10, reg[i] = data[tid + i] only when tid + i < count;
 * the remaining reg slots are left untouched.
 */
void global_to_regstr_pred_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            reg[i] = data[pos];
        }
    }
}
/*
 * Load eleven shorts data[tid + 0..10] from global memory into reg[0..10].
 * When count >= 11 the loads are unconditional (no per-element check);
 * otherwise the work is handed to global_to_regstr_pred_1_11_short, which
 * checks every position against `count`.
 */
void global_to_regstr_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_short(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
}
/*
 * Bounds-checked store of reg[0..10] to global memory.
 * For i in 0..10, dest[tid + i] = reg[i] only when tid + i < count.
 * Ends with a work-group barrier (local-memory fence).
 */
void regstr_to_global_1_11_short
(
    int count,
    const short * reg,
    int tid,
    global short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven shorts data[tid + 0..10] from local memory into
 * reg[0..10], then wait at a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    local const short * in = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = in[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Store the eleven values reg[0..10] into local memory at
 * dest[tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    local short * out = dest + tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Move eleven shorts from global memory into local memory via a private
 * temporary. global_to_regstr_1_11_short fills the temporary from `source`
 * and regstr_to_shared_1_11_short writes all eleven slots to
 * dest[tid + 0..10].
 * NOTE(review): when count < 11 the loader skips positions >= count, so
 * the matching temporary slots are never assigned before being stored.
 */
void global_to_shared_1_11_short
(
    int count,
    global const short * source,
    int tid,
    local short * dest
)
{
    short staged[11];

    global_to_regstr_1_11_short(count, source, tid, staged);
    regstr_to_shared_1_11_short(staged, tid, dest);
}
/*
 * Copy shorts from local memory to global memory at matching positions.
 * For i in 0..10 the element at position tid + i is copied only when that
 * position is below `count`. Ends with a work-group barrier
 * (local-memory fence).
 */
void shared_to_global_1_11_short
(
    int count,
    local const short * source,
    int tid,
    global short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven consecutive shorts data[11 * tid + 0..10] from local
 * memory into reg[0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void shared_to_thread_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    local const short * chunk = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = chunk[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Write reg[0..10] to the eleven consecutive local-memory slots
 * dest[11 * tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void thread_to_shared_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    local short * chunk = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
    {
        chunk[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to eleven longs from global memory.
 * For slot in 0..10, reg[slot] = data[tid + slot] only when
 * tid + slot < count; the remaining reg slots are left untouched.
 */
void global_to_regstr_pred_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int pos = slot + tid;
        if (pos < count)
        {
            reg[slot] = data[pos];
        }
    }
}
/*
 * Load eleven longs data[tid + 0..10] from global memory into reg[0..10].
 * When count >= 11 the loads are unconditional (no per-element check);
 * otherwise the work is handed to global_to_regstr_pred_1_11_long, which
 * checks every position against `count`.
 */
void global_to_regstr_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_long(count, data, tid, reg);
        return;
    }

    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = data[slot + tid];
    }
}
/*
 * Bounds-checked store of reg[0..10] to global memory.
 * For slot in 0..10, dest[tid + slot] = reg[slot] only when
 * tid + slot < count. Ends with a work-group barrier (local-memory fence).
 */
void regstr_to_global_1_11_long
(
    int count,
    const long * reg,
    int tid,
    global long * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int pos = slot + tid;
        if (pos < count)
        {
            dest[pos] = reg[slot];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven longs data[tid + 0..10] from local memory into
 * reg[0..10], then wait at a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    local const long * in = data + tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = in[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Store the eleven values reg[0..10] into local memory at
 * dest[tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    local long * out = dest + tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        out[slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Move eleven longs from global memory into local memory via a private
 * temporary. global_to_regstr_1_11_long fills the temporary from `source`
 * and regstr_to_shared_1_11_long writes all eleven slots to
 * dest[tid + 0..10].
 * NOTE(review): when count < 11 the loader skips positions >= count, so
 * the matching temporary slots are never assigned before being stored.
 */
void global_to_shared_1_11_long
(
    int count,
    global const long * source,
    int tid,
    local long * dest
)
{
    long staged[11];

    global_to_regstr_1_11_long(count, source, tid, staged);
    regstr_to_shared_1_11_long(staged, tid, dest);
}
/*
 * Copy longs from local memory to global memory at matching positions.
 * For slot in 0..10 the element at position tid + slot is copied only when
 * that position is below `count`. Ends with a work-group barrier
 * (local-memory fence).
 */
void shared_to_global_1_11_long
(
    int count,
    local const long * source,
    int tid,
    global long * dest
)
{
    for (int slot = 0; slot < 11; ++slot)
    {
        const int pos = slot + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven consecutive longs data[11 * tid + 0..10] from local
 * memory into reg[0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void shared_to_thread_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    local const long * chunk = data + 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        reg[slot] = chunk[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Write reg[0..10] to the eleven consecutive local-memory slots
 * dest[11 * tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void thread_to_shared_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    local long * chunk = dest + 11 * tid;
    for (int slot = 0; slot < 11; ++slot)
    {
        chunk[slot] = reg[slot];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to eleven floats from global memory.
 * For k in 0..10, reg[k] = data[tid + k] only when tid + k < count;
 * the remaining reg slots are left untouched.
 */
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count)
        {
            reg[k] = data[pos];
        }
    }
}
/*
 * Load eleven floats data[tid + 0..10] from global memory into reg[0..10].
 * When count >= 11 the loads are unconditional (no per-element check);
 * otherwise the work is handed to global_to_regstr_pred_1_11_float, which
 * checks every position against `count`.
 */
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    for (int k = 0; k < 11; ++k)
    {
        reg[k] = data[k + tid];
    }
}
/*
 * Bounds-checked store of reg[0..10] to global memory.
 * For k in 0..10, dest[tid + k] = reg[k] only when tid + k < count.
 * Ends with a work-group barrier (local-memory fence).
 */
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count)
        {
            dest[pos] = reg[k];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven floats data[tid + 0..10] from local memory into
 * reg[0..10], then wait at a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    local const float * in = data + tid;
    for (int k = 0; k < 11; ++k)
    {
        reg[k] = in[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Store the eleven values reg[0..10] into local memory at
 * dest[tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    local float * out = dest + tid;
    for (int k = 0; k < 11; ++k)
    {
        out[k] = reg[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Move eleven floats from global memory into local memory via a private
 * temporary. global_to_regstr_1_11_float fills the temporary from `source`
 * and regstr_to_shared_1_11_float writes all eleven slots to
 * dest[tid + 0..10].
 * NOTE(review): when count < 11 the loader skips positions >= count, so
 * the matching temporary slots are never assigned before being stored.
 */
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
/*
 * Copy floats from local memory to global memory at matching positions.
 * For k in 0..10 the element at position tid + k is copied only when that
 * position is below `count`. Ends with a work-group barrier
 * (local-memory fence).
 */
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven consecutive floats data[11 * tid + 0..10] from local
 * memory into reg[0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    local const float * chunk = data + 11 * tid;
    for (int k = 0; k < 11; ++k)
    {
        reg[k] = chunk[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Write reg[0..10] to the eleven consecutive local-memory slots
 * dest[11 * tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    local float * chunk = dest + 11 * tid;
    for (int k = 0; k < 11; ++k)
    {
        chunk[k] = reg[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to eleven ints from global memory.
 * For i in 0..10, reg[i] = data[tid + i] only when tid + i < count;
 * the remaining reg slots are left untouched.
 */
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            reg[i] = data[pos];
        }
    }
}
/*
 * Load eleven ints data[tid + 0..10] from global memory into reg[0..10].
 * When count >= 11 the loads are unconditional (no per-element check);
 * otherwise the work is handed to global_to_regstr_pred_1_11_int, which
 * checks every position against `count`.
 */
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
}
/*
 * Bounds-checked store of reg[0..10] to global memory.
 * For i in 0..10, dest[tid + i] = reg[i] only when tid + i < count.
 * Ends with a work-group barrier (local-memory fence).
 */
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Read the eleven ints data[tid + 0..10] from local memory into
 * reg[0..10], then wait at a work-group barrier (local-memory fence).
 */
void shared_to_regstr_1_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    local const int * in = data + tid;
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = in[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Store the eleven values reg[0..10] into local memory at
 * dest[tid + 0..10], then wait at a work-group barrier
 * (local-memory fence).
 */
void regstr_to_shared_1_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    local int * out = dest + tid;
    for (int i = 0; i < 11; ++i)
    {
        out[i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Moves 11 ints from global to local memory by way of a temporary
 * 11-element array: global_to_regstr_1_11_int fills the array and
 * regstr_to_shared_1_11_int writes it out (the latter ends in a barrier).
 *
 * count  - number of valid source elements, forwarded to the loader
 * source - source buffer in global memory
 * tid    - element offset, forwarded to both helpers
 * dest   - destination buffer in local memory
 */
void global_to_shared_1_11_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
/*
 * Copies up to 11 ints from local to global memory at matching indices:
 * dest[i + tid] = source[i + tid] for every i in [0, 11) whose index
 * i + tid is below count. Ends with a work-group barrier carrying a
 * local-memory fence.
 *
 * count  - exclusive upper bound on the copied index
 * source - source buffer in local memory
 * tid    - offset added to every element index
 * dest   - destination buffer in global memory
 */
void shared_to_global_1_11_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads the 11 consecutive ints that start at data[11 * tid] into reg,
 * then waits at a work-group barrier with a local-memory fence.
 *
 * data - source buffer in local memory
 * tid  - selects the 11-element run: elements 11*tid .. 11*tid + 10
 * reg  - destination array of at least 11 ints
 */
void shared_to_thread_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[11 * tid + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes the 11 ints in reg to the consecutive local-memory slots that
 * start at dest[11 * tid], then waits at a work-group barrier with a
 * local-memory fence.
 *
 * reg  - source array of 11 ints
 * tid  - selects the 11-element run: elements 11*tid .. 11*tid + 10
 * dest - destination buffer in local memory
 */
void thread_to_shared_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        dest[11 * tid + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Produces 11 merged (int, float) keys from two ranges of the same
 * local-memory key arrays: range A is [a_begin, a_end) and range B is
 * [b_begin, b_end).
 *
 * On each of the 11 steps the head of A is taken when B is exhausted, or
 * when A is not exhausted and comp(head of B, head of A) is false;
 * otherwise the head of B is taken. The chosen key is written to
 * results0/results1, its position in the key arrays to indices, and the
 * chosen range advances by one.
 *
 * a_begin, a_end - bounds of range A (indices into keys_shared0/1)
 * b_begin, b_end - bounds of range B (indices into keys_shared0/1)
 * indices        - out: 11 source positions of the emitted keys
 * keys_shared0/1 - key components in local memory
 * results0/1     - out: 11 emitted key components
 *
 * The next head is loaded right after every advance, including the last
 * one, so the element one past the consumed position is read.
 * NOTE(review): this presumably relies on the key arrays having a spare
 * trailing element - confirm against the caller's allocation.
 *
 * Ends with a work-group barrier carrying a local-memory fence.
 */
void serial_merge_11_int_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const int * keys_shared0,
    local const float * keys_shared1,
    int * results0,
    float * results1
)
{
    int left_key0 = keys_shared0[a_begin];
    int right_key0 = keys_shared0[b_begin];
    float left_key1 = keys_shared1[a_begin];
    float right_key1 = keys_shared1[b_begin];

    for (int step = 0; step < 11; ++step)
    {
        bool take_left = (b_begin >= b_end)
            || ((a_begin < a_end) && !comp(right_key0, right_key1, left_key0, left_key1));

        if (take_left)
        {
            results0[step] = left_key0;
            results1[step] = left_key1;
            indices[step] = a_begin;

            ++a_begin;
            left_key0 = keys_shared0[a_begin];
            left_key1 = keys_shared1[a_begin];
        }
        else
        {
            results0[step] = right_key0;
            results1[step] = right_key1;
            indices[step] = b_begin;

            ++b_begin;
            right_key0 = keys_shared0[b_begin];
            right_key1 = keys_shared1[b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Exchanges two (int, float) pairs in place: *a0 with *b0, then *a1
 * with *b1.
 */
void swap_int_float
(
    int * a0,
    float * a1,
    int * b0,
    float * b1
)
{
    const int saved_first = *a0;
    *a0 = *b0;
    *b0 = saved_first;

    const float saved_second = *a1;
    *a1 = *b1;
    *b1 = saved_second;
}
/*
 * Exchanges two (long, short) pairs in place: *a0 with *b0, then *a1
 * with *b1.
 */
void swap_long_short
(
    long * a0,
    short * a1,
    long * b0,
    short * b1
)
{
    const long saved_first = *a0;
    *a0 = *b0;
    *b0 = saved_first;

    const short saved_second = *a1;
    *a1 = *b1;
    *b1 = saved_second;
}
/*
 * Reorders 11 (int, float) keys in place, carrying the matching
 * (long, short) values along, using 11 alternating compare-exchange
 * passes over neighbouring elements.
 *
 * Passes 0, 2, 4, ... compare the pairs (0,1), (2,3), ..., (8,9);
 * passes 1, 3, 5, ... compare the pairs (1,2), (3,4), ..., (9,10).
 * A pair (i, i+1) is exchanged when comp(key[i+1], key[i]) is true.
 *
 * keys0, keys1 - key components, 11 elements each, modified in place
 * vals0, vals1 - value components, 11 elements each, permuted the same way
 */
void odd_even_transpose_sort_11_int_float_long_short
(
    int * keys0,
    float * keys1,
    long * vals0,
    short * vals1
)
{
    for (int pass = 0; pass < 11; ++pass)
    {
        // Even passes start at element 0, odd passes at element 1.
        for (int lo = pass & 1; lo < 10; lo += 2)
        {
            const int hi = lo + 1;
            if (comp(keys0[hi], keys1[hi], keys0[lo], keys1[lo]))
            {
                swap_int_float(keys0 + lo, keys1 + lo, keys0 + hi, keys1 + hi);
                swap_long_short(vals0 + lo, vals1 + lo, vals0 + hi, vals1 + hi);
            }
        }
    }
}
/*
 * Binary search along one cross-diagonal of the merge of sequences A and
 * B (both held in local memory as (int, float) key pairs).
 *
 * The search runs over [max(0, diag - b_count), min(diag, a_count)).
 * For a candidate mid it compares B[diag - 1 - mid] with A[mid]: when
 * comp(B element, A element) is false the lower bound moves past mid,
 * otherwise the upper bound drops to mid.
 *
 * a_count, b_count - lengths of A and B
 * diag             - diagonal being searched
 * a0, a1           - key components of A
 * b0, b1           - key components of B
 *
 * Returns the final lower bound, i.e. how many elements of A fall
 * before the diagonal.
 */
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    local const int * a0,
    local const float * a1,
    local const int * b0,
    local const float * b1
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid = (lo + hi) >> 1;
        const int b_pos = diag - 1 - mid;

        if (comp(b0[b_pos], b1[b_pos], a0[mid], a1[mid]))
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return lo;
}
/*
 * Performs one merge pass for the calling thread: locates the pair of
 * adjacent key ranges this thread works on, finds its split point with
 * merge_path_int_float, and emits 11 merged keys with
 * serial_merge_11_int_float.
 *
 * tid            - thread index within the group
 * count          - number of valid keys; every range bound is clamped to it
 * coop           - number of threads sharing one merge; coop - 1 is used
 *                  as a bit mask (assumes a power of two - TODO confirm)
 * indices        - out: source positions of the 11 emitted keys
 * keys_shared0/1 - key components in local memory
 * keys0, keys1   - out: the 11 emitted key components
 */
void block_sort_pass_1_11_int_float
(
    int tid,
    int count,
    int coop,
    int * indices,
    local const int * keys_shared0,
    local const float * keys_shared1,
    int * keys0,
    float * keys1
)
{
    const int mask = coop - 1;

    // First thread of the cooperating set, and this thread's diagonal in it.
    const int first_thread = ~mask & tid;
    const int diag = min(count, 11 * (mask & tid));

    // Range A is [a_lo, b_lo), range B is [b_lo, b_hi).
    const int base = 11 * first_thread;
    const int a_lo = min(count, base);
    const int b_lo = min(count, base + 11 * (coop / 2));
    const int b_hi = min(count, base + 11 * coop);

    const int split = merge_path_int_float(
        b_lo - a_lo, b_hi - b_lo, diag,
        keys_shared0 + a_lo, keys_shared1 + a_lo,
        keys_shared0 + b_lo, keys_shared1 + b_lo);

    serial_merge_11_int_float(
        a_lo + split, b_lo, b_lo + diag - split, b_hi,
        indices, keys_shared0, keys_shared1, keys0, keys1);
}
/*
 * Gathers 11 (long, short) values from local memory at the positions
 * listed in indices: reg0[i] = data0[indices[i]] and
 * reg1[i] = data1[indices[i]] for i in [0, 11). Ends with a work-group
 * barrier carrying a local-memory fence.
 *
 * indices      - 11 source positions
 * tid          - not used by this function
 * data0, data1 - value components in local memory
 * reg0, reg1   - out: 11 gathered value components
 */
void gather_1_11_long_short
(
    const int * indices,
    int tid,
    local const long * data0,
    local const short * data1,
    long * reg0,
    short * reg1
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int src = indices[i];
        reg0[i] = data0[src];
        reg1[i] = data1[src];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Merge-pass stage of the block sort for this kernel variant.
 *
 * In this variant the function performs no work: it neither reads nor
 * writes any of its arguments. The signature is kept so that
 * mergesort_1_11_int_float_long_short can call it unchanged.
 * NOTE(review): presumably empty because this variant runs one thread
 * per group, leaving nothing to merge across threads - confirm against
 * the code generator.
 *
 * The three scratch arrays the body used to declare (indices, keys0,
 * keys1) were never referenced and have been removed.
 */
void block_sort_loop_1_11_int_float_long_short
(
    int tid,
    int count,
    local int * keys_shared0,
    local float * keys_shared1,
    long * thread_vals0,
    short * thread_vals1,
    local long * vals_shared0,
    local short * vals_shared1
)
{
}
/*
 * Sorts the calling thread's 11 key/value pairs and publishes the keys
 * to local memory.
 *
 *  1. If the thread's first element (index 11 * tid) is below count, its
 *     11 keys and values are sorted with
 *     odd_even_transpose_sort_11_int_float_long_short.
 *  2. Both key components are written to local memory with the
 *     thread_to_shared_11_* helpers.
 *  3. block_sort_loop_1_11_int_float_long_short is invoked with the
 *     shared keys and the thread's values.
 *
 * count            - number of valid elements in the tile
 * tid              - thread index within the group
 * thread_keys0/1   - this thread's 11 key components, sorted in place
 * keys_shared0/1   - key staging buffers in local memory
 * thread_vals0/1   - this thread's 11 value components, sorted in place
 * vals_shared0/1   - value staging buffers in local memory
 */
void mergesort_1_11_int_float_long_short
(
    int count,
    int tid,
    int * thread_keys0,
    float * thread_keys1,
    local int * keys_shared0,
    local float * keys_shared1,
    long * thread_vals0,
    short * thread_vals1,
    local long * vals_shared0,
    local short * vals_shared1
)
{
    if (11 * tid < count)
    {
        odd_even_transpose_sort_11_int_float_long_short(
            thread_keys0, thread_keys1, thread_vals0, thread_vals1);
    }

    thread_to_shared_11_int(thread_keys0, tid, keys_shared0);
    thread_to_shared_11_float(thread_keys1, tid, keys_shared1);

    block_sort_loop_1_11_int_float_long_short(
        tid, count,
        keys_shared0, keys_shared1,
        thread_vals0, thread_vals1,
        vals_shared0, vals_shared1);
}
/*
 * Kernel: sorts one tile of (int, float) keys with attached
 * (long, short) values.
 *
 * Each work-group handles the tile that starts at element
 * 11 * get_group_id(0) and holds min(11, count - start) valid elements.
 *
 * count         - total number of elements in the input
 * keys_src0/1   - input key components (global memory)
 * keys_dst0/1   - output key components (global memory)
 * vals_src0/1   - input value components (global memory)
 * vals_dst0/1   - output value components (global memory)
 *
 * Steps:
 *  1. Values, then keys, are staged through local memory into per-thread
 *     arrays of 11 elements.
 *  2. A thread whose 11 slots are only partly valid overwrites its
 *     invalid slots with the greatest of its valid keys (per comp).
 *  3. mergesort_1_11_int_float_long_short sorts the tile; keys are then
 *     copied from local memory to the output, and values are written
 *     back to local memory and copied out likewise.
 */
kernel void block_sort
(
    int count,
    global const int * keys_src0,
    global const float * keys_src1,
    global int * keys_dst0,
    global float * keys_dst1,
    global const long * vals_src0,
    global const short * vals_src1,
    global long * vals_dst0,
    global short * vals_dst1
)
{
    // Key and value staging areas overlap in local memory.
    union Shared
    {
        struct
        {
            int keys0[12];
            float keys1[12];
        };
        struct
        {
            long vals0[11];
            short vals1[11];
        };
    };
    local union Shared shared;

    const int tid = get_local_id(0);
    const int block = get_group_id(0);
    const int gid = 11 * block;
    const int count2 = min(11, count - gid);

    // Values: global -> local -> per-thread arrays.
    long thread_vals0[11];
    short thread_vals1[11];
    global_to_shared_1_11_long(count2, vals_src0 + gid, tid, shared.vals0);
    global_to_shared_1_11_short(count2, vals_src1 + gid, tid, shared.vals1);
    shared_to_thread_11_long(shared.vals0, tid, thread_vals0);
    shared_to_thread_11_short(shared.vals1, tid, thread_vals1);

    // Keys: global -> local -> per-thread arrays.
    int thread_keys0[11];
    float thread_keys1[11];
    global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0);
    global_to_shared_1_11_float(count2, keys_src1 + gid, tid, shared.keys1);
    shared_to_thread_11_int(shared.keys0, tid, thread_keys0);
    shared_to_thread_11_float(shared.keys1, tid, thread_keys1);

    const int first = 11 * tid;
    if (first < count2 && first + 11 > count2)
    {
        // Find the greatest valid key held by this thread.
        int max_key0 = thread_keys0[0];
        float max_key1 = thread_keys1[0];
        for (int i = 1; i < 11; ++i)
        {
            if (first + i < count2 && comp(max_key0, max_key1, thread_keys0[i], thread_keys1[i]))
            {
                max_key0 = thread_keys0[i];
                max_key1 = thread_keys1[i];
            }
        }

        // Overwrite every slot past the end of the tile with that key.
        for (int i = 0; i < 11; ++i)
        {
            if (first + i >= count2)
            {
                thread_keys0[i] = max_key0;
                thread_keys1[i] = max_key1;
            }
        }
    }

    mergesort_1_11_int_float_long_short(
        count2, tid,
        thread_keys0, thread_keys1, shared.keys0, shared.keys1,
        thread_vals0, thread_vals1, shared.vals0, shared.vals1);

    // Keys are already in local memory; copy them to the output.
    shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid);
    shared_to_global_1_11_float(count2, shared.keys1, tid, keys_dst1 + gid);

    // Values go back through local memory before being copied out.
    thread_to_shared_11_long(thread_vals0, tid, shared.vals0);
    thread_to_shared_11_short(thread_vals1, tid, shared.vals1);
    shared_to_global_1_11_long(count2, shared.vals0, tid, vals_dst0 + gid);
    shared_to_global_1_11_short(count2, shared.vals1, tid, vals_dst1 + gid);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Strict "less than" on (int, float) pairs, compared lexicographically:
 * the int components decide unless they are equal, in which case the
 * float components decide.
 *
 * a1, a2 - first pair
 * b1, b2 - second pair
 *
 * Returns true when (a1, a2) orders before (b1, b2).
 */
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    if (a1 != b1)
    {
        return a1 < b1;
    }
    return a2 < b2;
}
int4 find_mergesort_frame
(
int coop,
int block,
int nv
)
{
int start = ~(coop - 1) & block;
int size = nv * (coop>> 1);
int4 frame;
frame.x = nv * start;
frame.y = nv * start + size;
frame.z = size;
return frame;
}
/*
 * Turns a frame and two merge-path values into the two source intervals
 * a block has to merge.
 *
 * frame    - frame.x / frame.y are the starts of the two ranges and
 *            frame.z is their size (frame.w is not read)
 * coop     - number of blocks cooperating on one merge
 * block    - index of the calling block
 * nv       - number of elements handled per block
 * count    - total number of elements; upper bounds are clamped to it
 * mp0, mp1 - merge-path values at the start and end of this block
 *
 * Returns an int4 with
 *   x, y - begin and end of the interval in the first range
 *   z, w - begin and end of the interval in the second range
 *
 * For the last block of a cooperating set (all low coop bits of block
 * set) both intervals run to the end of their range instead of using mp1.
 */
int4 find_mergesort_interval
(
    int4 frame,
    int coop,
    int block,
    int nv,
    int count,
    int mp0,
    int mp1
)
{
    const int mask = coop - 1;
    const int diag = nv * block - frame.x;

    int4 interval;
    interval.x = frame.x + mp0;
    interval.z = min(count, frame.y + diag - mp0);

    if ((mask & block) == mask)
    {
        // Last block of the set: take everything that is left.
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    }
    else
    {
        interval.y = min(count, frame.x + mp1);
        interval.w = min(count, frame.y + diag + nv - mp1);
    }
    return interval;
}
/*
 * Determines which parts of the two inputs the given block merges.
 *
 * a_count, b_count - lengths of the two inputs
 * block            - index of the calling block
 * coop             - non-zero selects the merge-sort layout, zero the
 *                    plain two-input merge layout
 * nv               - number of elements handled per block
 * mp_global        - merge-path values in global memory; entries block
 *                    and block + 1 are read
 *
 * When coop is non-zero the result is whatever find_mergesort_interval
 * returns for the frame given by find_mergesort_frame (with a_count
 * passed as the element count).
 *
 * Otherwise the result is
 *   x = mp_global[block]
 *   y = mp_global[block + 1]
 *   z = nv * block - x
 *   w = min(a_count + b_count, nv * block + nv) - y
 */
int4 compute_merge_range
(
    int a_count,
    int b_count,
    int block,
    int coop,
    int nv,
    global const int * mp_global
)
{
    const int mp0 = mp_global[block];
    const int mp1 = mp_global[block + 1];

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, block, nv);
        return find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }

    const int gid = nv * block;

    int4 range;
    range.x = mp0;
    range.y = mp1;
    range.z = gid - mp0;
    range.w = min(a_count + b_count, gid + nv) - mp1;
    return range;
}
/*
 * Bounds-checked load of up to 11 shorts from global memory:
 * reg[i] = data[i + tid] for every i in [0, 11) whose source index
 * i + tid is below count. Other entries of reg are left untouched.
 *
 * count - exclusive upper bound on the source index
 * data  - source buffer in global memory
 * tid   - offset added to every element index
 * reg   - destination array of at least 11 shorts
 */
void global_to_regstr_pred_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            reg[i] = data[pos];
        }
    }
}
/*
 * Reads 11 shorts from global memory into the caller's array:
 * reg[i] = data[i + tid] for i in [0, 11).
 *
 * count - number of valid elements reachable through data
 * data  - source buffer in global memory
 * tid   - offset added to every element index
 * reg   - destination array of at least 11 shorts
 *
 * With fewer than 11 valid elements the bounds-checked
 * global_to_regstr_pred_1_11_short is used instead.
 */
void global_to_regstr_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_short(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
}
/*
 * Writes up to 11 shorts from reg to global memory: dest[i + tid] = reg[i]
 * for every i in [0, 11) whose target index i + tid is below count.
 * Ends with a work-group barrier carrying a local-memory fence.
 *
 * count - exclusive upper bound on the destination index
 * reg   - source array of 11 shorts
 * tid   - offset added to every element index
 * dest  - destination buffer in global memory
 */
void regstr_to_global_1_11_short
(
    int count,
    const short * reg,
    int tid,
    global short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies 11 shorts from local memory into reg: reg[i] = data[i + tid]
 * for i in [0, 11), then waits at a work-group barrier with a
 * local-memory fence.
 *
 * data - source buffer in local memory
 * tid  - offset added to every element index
 * reg  - destination array of at least 11 shorts
 */
void shared_to_regstr_1_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores 11 shorts from reg into local memory: dest[i + tid] = reg[i]
 * for i in [0, 11), then waits at a work-group barrier with a
 * local-memory fence.
 *
 * reg  - source array of 11 shorts
 * tid  - offset added to every element index
 * dest - destination buffer in local memory
 */
void regstr_to_shared_1_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        dest[i + tid] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Moves 11 shorts from global to local memory by way of a temporary
 * 11-element array: global_to_regstr_1_11_short fills the array and
 * regstr_to_shared_1_11_short writes it out (the latter ends in a
 * barrier).
 *
 * count  - number of valid source elements, forwarded to the loader
 * source - source buffer in global memory
 * tid    - element offset, forwarded to both helpers
 * dest   - destination buffer in local memory
 */
void global_to_shared_1_11_short
(
    int count,
    global const short * source,
    int tid,
    local short * dest
)
{
    short staged[11];

    global_to_regstr_1_11_short(count, source, tid, staged);
    regstr_to_shared_1_11_short(staged, tid, dest);
}
/*
 * Copies up to 11 shorts from local to global memory at matching
 * indices: dest[i + tid] = source[i + tid] for every i in [0, 11) whose
 * index i + tid is below count. Ends with a work-group barrier carrying
 * a local-memory fence.
 *
 * count  - exclusive upper bound on the copied index
 * source - source buffer in local memory
 * tid    - offset added to every element index
 * dest   - destination buffer in global memory
 */
void shared_to_global_1_11_short
(
    int count,
    local const short * source,
    int tid,
    global short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = source[pos];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Reads the 11 consecutive shorts that start at data[11 * tid] into reg,
 * then waits at a work-group barrier with a local-memory fence.
 *
 * data - source buffer in local memory
 * tid  - selects the 11-element run: elements 11*tid .. 11*tid + 10
 * reg  - destination array of at least 11 shorts
 */
void shared_to_thread_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[11 * tid + i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Writes the 11 shorts in reg to the consecutive local-memory slots that
 * start at dest[11 * tid], then waits at a work-group barrier with a
 * local-memory fence.
 *
 * reg  - source array of 11 shorts
 * tid  - selects the 11-element run: elements 11*tid .. 11*tid + 10
 * dest - destination buffer in local memory
 */
void thread_to_shared_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        dest[11 * tid + i] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Bounds-checked load of up to 11 longs from global memory:
 * reg[i] = data[i + tid] for every i in [0, 11) whose source index
 * i + tid is below count. Other entries of reg are left untouched.
 *
 * count - exclusive upper bound on the source index
 * data  - source buffer in global memory
 * tid   - offset added to every element index
 * reg   - destination array of at least 11 longs
 */
void global_to_regstr_pred_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            reg[i] = data[pos];
        }
    }
}
/*
 * Reads 11 longs from global memory into the caller's array:
 * reg[i] = data[i + tid] for i in [0, 11).
 *
 * count - number of valid elements reachable through data
 * data  - source buffer in global memory
 * tid   - offset added to every element index
 * reg   - destination array of at least 11 longs
 *
 * With fewer than 11 valid elements the bounds-checked
 * global_to_regstr_pred_1_11_long is used instead.
 */
void global_to_regstr_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_long(count, data, tid, reg);
        return;
    }

    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
}
/*
 * Writes up to 11 longs from reg to global memory: dest[i + tid] = reg[i]
 * for every i in [0, 11) whose target index i + tid is below count.
 * Ends with a work-group barrier carrying a local-memory fence.
 *
 * count - exclusive upper bound on the destination index
 * reg   - source array of 11 longs
 * tid   - offset added to every element index
 * dest  - destination buffer in global memory
 */
void regstr_to_global_1_11_long
(
    int count,
    const long * reg,
    int tid,
    global long * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        const int pos = i + tid;
        if (pos < count)
        {
            dest[pos] = reg[i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Copies 11 longs from local memory into reg: reg[i] = data[i + tid]
 * for i in [0, 11), then waits at a work-group barrier with a
 * local-memory fence.
 *
 * data - source buffer in local memory
 * tid  - offset added to every element index
 * reg  - destination array of at least 11 longs
 */
void shared_to_regstr_1_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    for (int i = 0; i < 11; ++i)
    {
        reg[i] = data[i + tid];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Stores 11 longs from reg into local memory: dest[i + tid] = reg[i]
 * for i in [0, 11), then waits at a work-group barrier with a
 * local-memory fence.
 *
 * reg  - source array of 11 longs
 * tid  - offset added to every element index
 * dest - destination buffer in local memory
 */
void regstr_to_shared_1_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    for (int i = 0; i < 11; ++i)
    {
        dest[i + tid] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves a tile of longs from global to local memory by staging it in an
// 11-element temporary: first global_to_regstr_1_11_long, then
// regstr_to_shared_1_11_long.
void global_to_shared_1_11_long
(
    int count,
    global const long * source,
    int tid,
    local long * dest
)
{
    long staged[11];

    global_to_regstr_1_11_long(count, source, tid, staged);
    regstr_to_shared_1_11_long(staged, tid, dest);
}
// Copies up to 11 longs from local to global memory at matching positions:
// dest[k + tid] = source[k + tid] for each k in 0..10 with k + tid < count.
// Ends with a work-group barrier (local memory fence).
void shared_to_global_1_11_long
(
    int count,
    local const long * source,
    int tid,
    global long * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) dest[pos] = source[pos];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Reads the 11 consecutive longs starting at data[11 * tid] from local
// memory into reg[0..10]. Ends with a work-group barrier (local memory fence).
void shared_to_thread_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        reg[k] = data[base + k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots starting at
// dest[11 * tid]. Ends with a work-group barrier (local memory fence).
void thread_to_shared_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        dest[base + k] = reg[k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Guarded load of up to 11 floats from global memory: reg[k] = data[k + tid]
// for each k in 0..10 whose source position k + tid is below count.
// Entries of reg whose position is out of range are left untouched.
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) reg[k] = data[pos];
    }
}
// Loads 11 floats from global memory into reg: reg[k] = data[k + tid].
// When count is below 11 the work is delegated to the guarded variant,
// which checks every position against count.
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
        return;
    }

    // Full tile: every position is read without a bounds check.
    for (int k = 0; k < 11; ++k)
        reg[k] = data[k + tid];
}
// Stores up to 11 floats from reg to global memory: dest[k + tid] = reg[k]
// for each k in 0..10 whose target position k + tid is below count.
// Ends with a work-group barrier (local memory fence).
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) dest[pos] = reg[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies 11 floats from local memory into reg: reg[k] = data[k + tid].
// No bounds check is performed. Ends with a work-group barrier
// (local memory fence).
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    for (int k = 0; k < 11; ++k)
        reg[k] = data[k + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies 11 floats from reg into local memory: dest[k + tid] = reg[k].
// No bounds check is performed. Ends with a work-group barrier
// (local memory fence).
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    for (int k = 0; k < 11; ++k)
        dest[k + tid] = reg[k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves a tile of floats from global to local memory by staging it in an
// 11-element temporary: first global_to_regstr_1_11_float, then
// regstr_to_shared_1_11_float.
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    float staged[11];

    global_to_regstr_1_11_float(count, source, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, dest);
}
// Copies up to 11 floats from local to global memory at matching positions:
// dest[k + tid] = source[k + tid] for each k in 0..10 with k + tid < count.
// Ends with a work-group barrier (local memory fence).
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) dest[pos] = source[pos];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Reads the 11 consecutive floats starting at data[11 * tid] from local
// memory into reg[0..10]. Ends with a work-group barrier (local memory fence).
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        reg[k] = data[base + k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots starting at
// dest[11 * tid]. Ends with a work-group barrier (local memory fence).
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        dest[base + k] = reg[k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Guarded load of up to 11 ints from global memory: reg[k] = data[k + tid]
// for each k in 0..10 whose source position k + tid is below count.
// Entries of reg whose position is out of range are left untouched.
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) reg[k] = data[pos];
    }
}
// Loads 11 ints from global memory into reg: reg[k] = data[k + tid].
// When count is below 11 the work is delegated to the guarded variant,
// which checks every position against count.
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    if (count < 11)
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
        return;
    }

    // Full tile: every position is read without a bounds check.
    for (int k = 0; k < 11; ++k)
        reg[k] = data[k + tid];
}
// Stores up to 11 ints from reg to global memory: dest[k + tid] = reg[k]
// for each k in 0..10 whose target position k + tid is below count.
// Ends with a work-group barrier (local memory fence).
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) dest[pos] = reg[k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies 11 ints from local memory into reg: reg[k] = data[k + tid].
// No bounds check is performed. Ends with a work-group barrier
// (local memory fence).
void shared_to_regstr_1_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    for (int k = 0; k < 11; ++k)
        reg[k] = data[k + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Copies 11 ints from reg into local memory: dest[k + tid] = reg[k].
// No bounds check is performed. Ends with a work-group barrier
// (local memory fence).
void regstr_to_shared_1_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    for (int k = 0; k < 11; ++k)
        dest[k + tid] = reg[k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves a tile of ints from global to local memory by staging it in an
// 11-element temporary: first global_to_regstr_1_11_int, then
// regstr_to_shared_1_11_int.
void global_to_shared_1_11_int
(
    int count,
    global const int * source,
    int tid,
    local int * dest
)
{
    int staged[11];

    global_to_regstr_1_11_int(count, source, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, dest);
}
// Copies up to 11 ints from local to global memory at matching positions:
// dest[k + tid] = source[k + tid] for each k in 0..10 with k + tid < count.
// Ends with a work-group barrier (local memory fence).
void shared_to_global_1_11_int
(
    int count,
    local const int * source,
    int tid,
    global int * dest
)
{
    for (int k = 0; k < 11; ++k)
    {
        const int pos = k + tid;
        if (pos < count) dest[pos] = source[pos];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Reads the 11 consecutive ints starting at data[11 * tid] from local
// memory into reg[0..10]. Ends with a work-group barrier (local memory fence).
void shared_to_thread_11_int
(
    local const int * data,
    int tid,
    int * reg
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        reg[k] = data[base + k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Writes reg[0..10] to the 11 consecutive local-memory slots starting at
// dest[11 * tid]. Ends with a work-group barrier (local memory fence).
void thread_to_shared_11_int
(
    const int * reg,
    int tid,
    local int * dest
)
{
    const int base = 11 * tid;

    for (int k = 0; k < 11; ++k)
        dest[base + k] = reg[k];
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Produces 11 merged (int, float) keys from two ranges of the local-memory
// key arrays: A = [a_begin, a_end) and B = [b_begin, b_end).
//
// At every step the A head is taken when B is exhausted, or when A is not
// exhausted and comp(B head, A head) is false; otherwise the B head is taken.
// The chosen key goes to results0/results1 and its position in the key arrays
// goes to indices. The key following the consumed one is then loaded
// unconditionally (also after the last step), so the key arrays are read at
// the position right after the last consumed element.
//
// Ends with a work-group barrier (local memory fence).
void serial_merge_11_int_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const int * keys_shared0,
    local const float * keys_shared1,
    int * results0,
    float * results1
)
{
    // Current head of each input range.
    int   a_key0 = keys_shared0[a_begin];
    float a_key1 = keys_shared1[a_begin];
    int   b_key0 = keys_shared0[b_begin];
    float b_key1 = keys_shared1[b_begin];

    for (int step = 0; step < 11; ++step)
    {
        const bool take_a = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1));

        if (take_a)
        {
            results0[step] = a_key0;
            results1[step] = a_key1;
            indices[step]  = a_begin;

            ++a_begin;
            a_key0 = keys_shared0[a_begin];
            a_key1 = keys_shared1[a_begin];
        }
        else
        {
            results0[step] = b_key0;
            results1[step] = b_key1;
            indices[step]  = b_begin;

            ++b_begin;
            b_key0 = keys_shared0[b_begin];
            b_key1 = keys_shared1[b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Binary search along the cross-diagonal `diag` of the merge of A (a0/a1,
// a_count elements) and B (b0/b1, b_count elements), both in local memory.
// Returns the number of elements taken from A; the remaining diag - result
// elements come from B. The search interval is
// [max(0, diag - b_count), min(diag, a_count)].
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    local const int * a0,
    local const float * a1,
    local const int * b0,
    local const float * b1
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);

    while (lo < hi)
    {
        const int mid  = (lo + hi) >> 1;
        const int b_at = diag - 1 - mid;

        // If the B element orders before the A element, the split point
        // lies at or below mid; otherwise it lies above.
        if (comp(b0[b_at], b1[b_at], a0[mid], a1[mid]))
            hi = mid;
        else
            lo = mid + 1;
    }
    return lo;
}
// Loads a tile of 11 floats from the concatenation of two global ranges,
// A (a_count elements) followed by B (b_count elements), into reg.
// Position index = k + tid reads a_global[index] when index < a_count and
// b_global[index - a_count] otherwise.
//
// When a_count + b_count >= 11 the B read is unguarded; for a shorter tile,
// positions at or beyond a_count + b_count are skipped and the matching reg
// entries are left untouched.
//
// The B element is addressed with an explicit index subtraction rather than
// by rebasing the pointer (b_global -= a_count), so no pointer before the
// start of the B buffer is ever formed.
void load2_to_regstr_1_11_11_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    float * reg
)
{
    const int total = a_count + b_count;
    const bool full_tile = (total >= 11);

    for (int k = 0; k < 11; ++k)
    {
        const int index = k + tid;

        if (index < a_count)
            reg[k] = a_global[index];
        else if (full_tile || index < total)
            reg[k] = b_global[index - a_count];
    }
}
// Loads a tile of floats from the two global ranges A and B into local
// memory, staging it in an 11-element temporary: first
// load2_to_regstr_1_11_11_float, then regstr_to_shared_1_11_float.
void load2_to_shared_1_11_11_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    local float * shared
)
{
    float staged[11];

    load2_to_regstr_1_11_11_float(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, shared);
}
// Loads a tile of 11 ints from the concatenation of two global ranges,
// A (a_count elements) followed by B (b_count elements), into reg.
// Position index = k + tid reads a_global[index] when index < a_count and
// b_global[index - a_count] otherwise.
//
// When a_count + b_count >= 11 the B read is unguarded; for a shorter tile,
// positions at or beyond a_count + b_count are skipped and the matching reg
// entries are left untouched.
//
// The B element is addressed with an explicit index subtraction rather than
// by rebasing the pointer (b_global -= a_count), so no pointer before the
// start of the B buffer is ever formed.
void load2_to_regstr_1_11_11_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    int * reg
)
{
    const int total = a_count + b_count;
    const bool full_tile = (total >= 11);

    for (int k = 0; k < 11; ++k)
    {
        const int index = k + tid;

        if (index < a_count)
            reg[k] = a_global[index];
        else if (full_tile || index < total)
            reg[k] = b_global[index - a_count];
    }
}
// Loads a tile of ints from the two global ranges A and B into local
// memory, staging it in an 11-element temporary: first
// load2_to_regstr_1_11_11_int, then regstr_to_shared_1_11_int.
void load2_to_shared_1_11_11_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    local int * shared
)
{
    int staged[11];

    load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_11_int(staged, tid, shared);
}
// Merges one tile of (int, float) keys.
//
// The incoming a_count / b_count values are not used: both are recomputed
// from range, where A = [range.x, range.y) and B = [range.z, range.w).
// Steps:
//   1. load both key components of A followed by B into local memory;
//   2. find this work-item's split on diagonal 11 * tid with
//      merge_path_int_float (B starts at offset a_count in the key arrays);
//   3. serially merge 11 keys into results0/results1, recording their
//      source positions in indices.
void merge_keys_indices_1_11_int_float
(
    int a_count,
    int b_count,
    int4 range,
    int tid,
    int * indices,
    global const int * a_global0,
    global const float * a_global1,
    global const int * b_global0,
    global const float * b_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    int * results0,
    float * results1
)
{
    const int a_first = range.x;
    const int b_first = range.z;

    a_count = range.y - a_first;
    b_count = range.w - b_first;

    load2_to_shared_1_11_11_int(a_global0 + a_first, a_count, b_global0 + b_first, b_count, tid, keys_shared0);
    load2_to_shared_1_11_11_float(a_global1 + a_first, a_count, b_global1 + b_first, b_count, tid, keys_shared1);

    const int diag  = 11 * tid;
    const int split = merge_path_int_float(a_count, b_count, diag, keys_shared0, keys_shared1, keys_shared0 + a_count, keys_shared1 + a_count);

    // A is consumed from `split`, B from the rest of the diagonal; both are
    // expressed as positions inside the combined key arrays.
    serial_merge_11_int_float(
        split,
        a_count,
        a_count + diag - split,
        a_count + b_count,
        indices, keys_shared0, keys_shared1, results0, results1);
}
// Gathers the (long, short) values that belong to 11 merged keys.
//
// indices[k] is a position in the concatenation of A followed by B, where B
// starts at b_start: positions below b_start read a_global*[indices[k]],
// the others read b_global*[indices[k] - b_start].
//
// When count >= 11 all 11 slots are gathered; otherwise slot k is gathered
// only if k + tid < count, and skipped slots leave reg0/reg1 untouched (their
// indices entry is not read).
//
// The B element is addressed with an explicit index subtraction rather than
// by rebasing the pointers (b_global -= b_start), so no pointer before the
// start of the B buffers is ever formed.
//
// Ends with a work-group barrier (local memory fence).
void transfer_merge_values_regstr_1_11_long_short
(
    int count,
    int b_start,
    const int * indices,
    int tid,
    global const long * a_global0,
    global const short * a_global1,
    global const long * b_global0,
    global const short * b_global1,
    long * reg0,
    short * reg1
)
{
    const bool full_tile = (count >= 11);

    for (int k = 0; k < 11; ++k)
    {
        // Partial tile: skip slots whose position lies beyond the data.
        if (!full_tile && !(k + tid < count)) continue;

        const int src = indices[k];
        if (src < b_start)
        {
            reg0[k] = a_global0[src];
            reg1[k] = a_global1[src];
        }
        else
        {
            reg0[k] = b_global0[src - b_start];
            reg1[k] = b_global1[src - b_start];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Moves the (long, short) values of one merged tile to their destination:
//   1. copy the merge indices from local memory into a temporary array;
//   2. gather the values they refer to from the A / B value arrays;
//   3. store both value components to dest_global0 / dest_global1,
//      guarded by count.
void transfer_merge_values_shared_1_11_long_short
(
    int count,
    int b_start,
    local const int * indices_shared,
    int tid,
    global const long * a_global0,
    global const short * a_global1,
    global const long * b_global0,
    global const short * b_global1,
    global long * dest_global0,
    global short * dest_global1
)
{
    int   picked[11];
    long  vals0[11];
    short vals1[11];

    shared_to_regstr_1_11_int(indices_shared, tid, picked);
    transfer_merge_values_regstr_1_11_long_short(count, b_start, picked, tid, a_global0, a_global1, b_global0, b_global1, vals0, vals1);

    regstr_to_global_1_11_long(count, vals0, tid, dest_global0);
    regstr_to_global_1_11_short(count, vals1, tid, dest_global1);
}
// Merges one tile of (int, float) keys with attached (long, short) values.
//
//   1. merge the keys, also collecting the source position of each output;
//   2. pass the merged keys through local memory and write them to
//      keys_global* at offset 11 * block;
//   3. pass the source positions through local memory and use them to
//      gather the values into vals_global* at offset 11 * block.
//
// The A and B lengths used for steps 2 and 3 are derived from range
// (range.y - range.x and range.w - range.z).
void device_merge_1_11_int_float_long_short
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    global const long * a_vals_global0,
    global const short * a_vals_global1,
    global const long * b_vals_global0,
    global const short * b_vals_global1,
    global long * vals_global0,
    global short * vals_global1,
    int tid,
    int block,
    int4 range,
    local int * indices_shared
)
{
    int   merged0[11];
    float merged1[11];
    int   origin[11];

    merge_keys_indices_1_11_int_float(a_count, b_count, range, tid, origin, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_shared0, keys_shared1, merged0, merged1);

    thread_to_shared_11_int(merged0, tid, keys_shared0);
    thread_to_shared_11_float(merged1, tid, keys_shared1);

    const int a_len     = range.y - range.x;
    const int b_len     = range.w - range.z;
    const int total     = a_len + b_len;
    const int tile_base = 11 * block;

    shared_to_global_1_11_int(total, keys_shared0, tid, keys_global0 + tile_base);
    shared_to_global_1_11_float(total, keys_shared1, tid, keys_global1 + tile_base);

    thread_to_shared_11_int(origin, tid, indices_shared);
    transfer_merge_values_shared_1_11_long_short(total, a_len, indices_shared, tid, a_vals_global0 + range.x, a_vals_global1 + range.x, b_vals_global0 + range.z, b_vals_global1 + range.z, vals_global0 + tile_base, vals_global1 + tile_base);
}
// Kernel entry point: merges one tile of (int, float) keys with
// (long, short) values per work-group.
//
// The tile boundaries come from compute_merge_range (defined outside this
// excerpt; presumably derived from the partition points in mp_global -
// confirm against its definition). The actual work is done by
// device_merge_1_11_int_float_long_short.
kernel void merge
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    global const long * a_vals_global0,
    global const short * a_vals_global1,
    global const long * b_vals_global0,
    global const short * b_vals_global1,
    global long * vals_global0,
    global short * vals_global1,
    global const int * mp_global,
    int coop
)
{
    // Local scratch: the key arrays and the index array overlap in storage.
    union MergeScratch
    {
        struct
        {
            int keys0[12];
            float keys1[12];
        };
        int indices[11];
    };
    local union MergeScratch scratch;

    const int lid   = get_local_id(0);
    const int group = get_group_id(0);
    const int4 span = compute_merge_range(a_count, b_count, group, coop, 11, mp_global);

    device_merge_1_11_int_float_long_short(a_count, b_count, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_global0, keys_global1, scratch.keys0, scratch.keys1, a_vals_global0, a_vals_global1, b_vals_global0, b_vals_global1, vals_global0, vals_global1, lid, group, span, scratch.indices);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Lexicographic "less than" on (int, float) pairs: the int components decide
// unless they are equal, in which case the float components are compared.
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    if (a1 != b1) return a1 < b1;
    return a2 < b2;
}
// Binary search along the cross-diagonal `diag` of the merge of A (a0/a1,
// a_count elements) and B (b0/b1, b_count elements), both in global memory.
// Returns the number of elements taken from A; the remaining diag - result
// elements come from B. The search interval is
// [max(0, diag - b_count), min(diag, a_count)].
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    global const int * a0,
    global const float * a1,
    global const int * b0,
    global const float * b1
)
{
    int begin = max(0, diag - b_count);
    int end = min(diag, a_count);

    while (begin < end)
    {
        // Midpoint computed as begin + half the distance: same value as
        // (begin + end) >> 1 for 0 <= begin < end, but the sum cannot
        // overflow int when both bounds are large.
        const int mid = begin + ((end - begin) >> 1);
        const int b_at = diag - 1 - mid;

        if (!comp(b0[b_at], b1[b_at], a0[mid], a1[mid]))
            begin = mid + 1;
        else
            end = mid;
    }
    return begin;
}
// Computes the frame of a merge pass for the given block:
//   x = nv * start            where start = block with the bits of
//                             (coop - 1) cleared,
//   y = x + size              where size = nv * (coop >> 1),
//   z = size,
//   w = 0 (unused; set explicitly so the returned vector never carries an
//          uninitialized component).
int4 find_mergesort_frame
(
    int coop,
    int block,
    int nv
)
{
    const int start = ~(coop - 1) & block;
    const int size = nv * (coop >> 1);

    int4 frame;
    frame.x = nv * start;
    frame.y = nv * start + size;
    frame.z = size;
    frame.w = 0;
    return frame;
}
// Kernel: computes one merge-path partition point per work-item and stores
// it in mp_global[partition]. Work-items with a global id at or above
// num_searches do nothing.
//
// With coop == 0 the search runs over the whole of A and B on diagonal
// nv * partition (clamped to a_count + b_count). With coop != 0 the A and B
// ranges are both carved out of the first a_count elements using the frame
// from find_mergesort_frame, and the diagonal is made relative to the start
// of that frame.
kernel void merge_partition
(
    int a_count,
    int b_count,
    int nv,
    int coop,
    global int * mp_global,
    int num_searches,
    global const int * a_global0,
    global const float * a_global1,
    global const int * b_global0,
    global const float * b_global1
)
{
    const int partition = get_global_id(0);
    if (partition >= num_searches) return;

    int a_first = 0;
    int b_first = 0;
    int diag = nv * partition;

    if (coop)
    {
        const int4 frame = find_mergesort_frame(coop, partition, nv);

        a_first = frame.x;
        b_first = min(a_count, frame.y);
        // b_count must be derived before a_count is overwritten below.
        b_count = min(a_count, frame.y + frame.z) - b_first;
        a_count = min(a_count, frame.x + frame.z) - a_first;
        diag -= a_first;
    }

    mp_global[partition] = merge_path_int_float(a_count, b_count, min(diag, a_count + b_count), a_global0 + a_first, a_global1 + a_first, b_global0 + b_first, b_global1 + b_first);
}
*** No errors detected
<end of output>
Test time = 13.27 sec
----------------------------------------------------------
Test Passed.
"sort" end time: Jan 30 13:43 IST
"sort" time elapsed: 00:00:13
----------------------------------------------------------
22/30 Testing: scan
22/30 Test: scan
Command: "/tmp/vexcl/build/tests/scan"
Directory: /tmp/vexcl/build/tests
"scan" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605613
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Element-wise prm_1[i] = prm_2[i] + prm_3[i] for i in [0, n).
// The index range is cut into equal contiguous slices, one per work-item.
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    size_t work_items = get_global_size(0);
    ulong per_item = (n + work_items - 1) / work_items;   // ceil(n / work_items)
    ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n)
    {
        last = n;
    }

    ulong idx = first;
    while (idx < last)
    {
        prm_1[idx] = ( prm_2[idx] + prm_3[idx] );
        ++idx;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Per-work-group step of the scan: each group loads up to two consecutive
// elements of `input` (starting at block * 2) into local memory, combines
// them with oper(), and stores two values per group.
//   n         - upper bound for indices into `input`; loads past it are skipped
//   input     - values to scan
//   identity  - combined with input[0] when `exclusive` is set
//   scan_buf1 - receives shared[1] (the combined pair) for each group
//   scan_buf2 - receives shared[0] (the first element) for each group
//   exclusive - non-zero re-seeds the first slot of global work-item 0
// NOTE(review): the local array size (2) and the loop start (1) look like
// values baked in by the kernel generator for a work-group size of 1 -
// confirm against the host code.
kernel void block_inclusive_scan
(
ulong n,
global const int * input,
int identity,
global int * scan_buf1,
global int * scan_buf2,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
local int shared[2];
// NOTE(review): a slot whose bound check fails is left unwritten, yet it is
// still read by the loop and the final stores below.
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id];
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1];
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]);
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 folds shared[0] into shared[1].
for (size_t start = 1; start > 0; start >>= 1, offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int y2 = shared[temp2];
int y1 = shared[temp1];
shared[temp2] = oper(y2, y1);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's results.
if (l_id == 0)
{
scan_buf1[ block ] = shared[1];
scan_buf2[ block ] = shared[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Scans the per-group partial results: each work-item owns work_per_thread
// consecutive entries of pre_sum starting at map_id = g_id * work_per_thread
// and writes running totals for them into post_sum.
//   n               - upper bound for indices into pre_sum
//   post_sum        - output running totals
//   pre_sum         - input partial results
//   identity        - not referenced in this kernel body
//   work_per_thread - number of entries handled by each work-item
// NOTE(review): the local array size (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void intra_block_inclusive_scan
(
ulong n,
global int * post_sum,
global const int * pre_sum,
int identity,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
local int shared[1];
size_t offset;
int work_sum;
// Phase 1: reduce this work-item's own entries into work_sum.
// NOTE(review): work_sum stays unassigned when map_id >= n but is still
// copied into scan_sum and shared[] below.
if (map_id < n)
{
offset = 0;
work_sum = pre_sum[map_id];
for( offset = 1; offset < work_per_thread; ++offset )
{
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] );
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int scan_sum = work_sum;
shared[ l_id ] = work_sum;
// Phase 2: scan across work-items of the group. With the bound "offset < 1"
// the condition is false on entry, so this loop body never executes.
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: write running totals for the owned entries, seeded with the
// previous work-item's total when there is one.
// NOTE(review): this read of pre_sum[map_id] and the post_sum[map_id] writes
// are not guarded by map_id < n - confirm the launch size makes that safe.
work_sum = pre_sum[map_id];
if (l_id > 0)
{
work_sum = oper(work_sum, shared[l_id - 1]);
post_sum[map_id] = work_sum;
}
else post_sum[map_id] = work_sum;
for( offset = 1; offset < work_per_thread; ++offset )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = oper(pre_sum[map_id + offset], work_sum);
post_sum[ map_id + offset ] = y;
work_sum = y;
}
else
{
// NOTE(review): this branch also runs when map_id >= n, and neither branch
// checks map_id + offset < n.
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);
work_sum = post_sum[map_id + offset];
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Final pass of the scan: combines each element with the total carried in
// from the preceding groups and writes the result to `output`.
//   n         - number of elements; work-items with g_id >= n write nothing
//   input     - original input values
//   output    - scan result, one entry per g_id < n
//   post_sum  - running totals from the intra-block pass (index block/2 - 1)
//   pre_sum   - per-group partial results (index block/2)
//   identity  - value used for element 0 of an exclusive scan
//   exclusive - non-zero selects the exclusive scan
// NOTE(review): the local array size (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_addition
(
ulong n,
global const int * input,
global int * output,
global int * post_sum,
global int * pre_sum,
int identity,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
// Start from identity so that work-items past the end of the input
// (g_id >= n) never copy an indeterminate value into local memory.
int val = identity;
local int shared[1];
if (g_id < n)
{
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;
else val = input[g_id];
}
shared[l_id] = val;
int scan_result = val;
int post_block_sum, new_result;
int y1, y2, sum;
// Work-item 0 of each group folds in the carry from preceding groups.
if(l_id == 0 && g_id < n)
{
if(block > 0)
{
// The carry comes from post_sum for even blocks, from pre_sum[0] for
// block 1, and from both buffers for the remaining odd blocks.
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ];
else if(block == 1) post_block_sum = pre_sum[0];
else
{
y1 = post_sum[ block/2 - 1 ];
y2 = pre_sum [ block/2];
post_block_sum = oper(y1, y2);
}
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum );
}
else new_result = scan_result;
shared[ l_id ] = new_result;
}
sum = shared[ l_id ];
// Scan across the work-items of the group; with the bound "offset < 1" the
// condition is false on entry, so the body never executes.
for( size_t offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(g_id < n) output[ g_id ] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// User function applied by the vector kernel: integer addition.
int device
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Element-wise prm_1[i] = device(prm_2[i], prm_3) for i in [0, n).
// The index range is cut into equal contiguous slices, one per work-item.
kernel void vexcl_vector_kernel
(
    ulong n,
    global int * prm_1,
    global int * prm_2,
    int prm_3
)
{
    size_t work_items = get_global_size(0);
    ulong per_item = (n + work_items - 1) / work_items;   // ceil(n / work_items)
    ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n)
    {
        last = n;
    }

    ulong idx = first;
    while (idx < last)
    {
        prm_1[idx] = device( prm_2[idx], prm_3 );
        ++idx;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: double-precision addition.
double oper
(
    double x,
    double y
)
{
    double sum = x + y;
    return sum;
}
// Per-work-group step of the scan (double precision): each group loads up to
// two consecutive elements of `input` (starting at block * 2) into local
// memory, combines them with oper(), and stores two values per group.
//   n         - upper bound for indices into `input`; loads past it are skipped
//   input     - values to scan
//   identity  - combined with input[0] when `exclusive` is set
//   scan_buf1 - receives shared[1] (the combined pair) for each group
//   scan_buf2 - receives shared[0] (the first element) for each group
//   exclusive - non-zero re-seeds the first slot of global work-item 0
// NOTE(review): the local array size (2) and the loop start (1) look like
// values baked in by the kernel generator for a work-group size of 1 -
// confirm against the host code.
kernel void block_inclusive_scan
(
ulong n,
global const double * input,
double identity,
global double * scan_buf1,
global double * scan_buf2,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
local double shared[2];
// NOTE(review): a slot whose bound check fails is left unwritten, yet it is
// still read by the loop and the final stores below.
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id];
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1];
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]);
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 folds shared[0] into shared[1].
for (size_t start = 1; start > 0; start >>= 1, offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
double y2 = shared[temp2];
double y1 = shared[temp1];
shared[temp2] = oper(y2, y1);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's results.
if (l_id == 0)
{
scan_buf1[ block ] = shared[1];
scan_buf2[ block ] = shared[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: double-precision addition.
double oper
(
    double x,
    double y
)
{
    double sum = x + y;
    return sum;
}
// Scans the per-group partial results (double precision): each work-item
// owns work_per_thread consecutive entries of pre_sum starting at
// map_id = g_id * work_per_thread and writes running totals into post_sum.
//   n               - upper bound for indices into pre_sum
//   post_sum        - output running totals
//   pre_sum         - input partial results
//   identity        - not referenced in this kernel body
//   work_per_thread - number of entries handled by each work-item
// NOTE(review): the local array size (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void intra_block_inclusive_scan
(
ulong n,
global double * post_sum,
global const double * pre_sum,
double identity,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
local double shared[1];
size_t offset;
double work_sum;
// Phase 1: reduce this work-item's own entries into work_sum.
// NOTE(review): work_sum stays unassigned when map_id >= n but is still
// copied into scan_sum and shared[] below.
if (map_id < n)
{
offset = 0;
work_sum = pre_sum[map_id];
for( offset = 1; offset < work_per_thread; ++offset )
{
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] );
}
}
barrier(CLK_LOCAL_MEM_FENCE);
double scan_sum = work_sum;
shared[ l_id ] = work_sum;
// Phase 2: scan across work-items of the group. With the bound "offset < 1"
// the condition is false on entry, so this loop body never executes.
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: write running totals for the owned entries, seeded with the
// previous work-item's total when there is one.
// NOTE(review): this read of pre_sum[map_id] and the post_sum[map_id] writes
// are not guarded by map_id < n - confirm the launch size makes that safe.
work_sum = pre_sum[map_id];
if (l_id > 0)
{
work_sum = oper(work_sum, shared[l_id - 1]);
post_sum[map_id] = work_sum;
}
else post_sum[map_id] = work_sum;
for( offset = 1; offset < work_per_thread; ++offset )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
double y = oper(pre_sum[map_id + offset], work_sum);
post_sum[ map_id + offset ] = y;
work_sum = y;
}
else
{
// NOTE(review): this branch also runs when map_id >= n, and neither branch
// checks map_id + offset < n.
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);
work_sum = post_sum[map_id + offset];
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Binary operator used by the scan kernels: double-precision addition.
double oper
(
    double x,
    double y
)
{
    double sum = x + y;
    return sum;
}
// Final pass of the scan (double precision): combines each element with the
// total carried in from the preceding groups and writes the result to
// `output`.
//   n         - number of elements; work-items with g_id >= n write nothing
//   input     - original input values
//   output    - scan result, one entry per g_id < n
//   post_sum  - running totals from the intra-block pass (index block/2 - 1)
//   pre_sum   - per-group partial results (index block/2)
//   identity  - value used for element 0 of an exclusive scan
//   exclusive - non-zero selects the exclusive scan
// NOTE(review): the local array size (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_addition
(
ulong n,
global const double * input,
global double * output,
global double * post_sum,
global double * pre_sum,
double identity,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
// Start from identity so that work-items past the end of the input
// (g_id >= n) never copy an indeterminate value into local memory.
double val = identity;
local double shared[1];
if (g_id < n)
{
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;
else val = input[g_id];
}
shared[l_id] = val;
double scan_result = val;
double post_block_sum, new_result;
double y1, y2, sum;
// Work-item 0 of each group folds in the carry from preceding groups.
if(l_id == 0 && g_id < n)
{
if(block > 0)
{
// The carry comes from post_sum for even blocks, from pre_sum[0] for
// block 1, and from both buffers for the remaining odd blocks.
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ];
else if(block == 1) post_block_sum = pre_sum[0];
else
{
y1 = post_sum[ block/2 - 1 ];
y2 = pre_sum [ block/2];
post_block_sum = oper(y1, y2);
}
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum );
}
else new_result = scan_result;
shared[ l_id ] = new_result;
}
sum = shared[ l_id ];
// Scan across the work-items of the group; with the bound "offset < 1" the
// condition is false on entry, so the body never executes.
for( size_t offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(g_id < n) output[ g_id ] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// User function applied by the vector kernel: double-precision addition.
double device
(
    double x,
    double y
)
{
    double sum = x + y;
    return sum;
}
// Element-wise prm_1[i] = device(prm_2[i], prm_3) for i in [0, n).
// The index range is cut into equal contiguous slices, one per work-item.
kernel void vexcl_vector_kernel
(
    ulong n,
    global double * prm_1,
    global double * prm_2,
    double prm_3
)
{
    size_t work_items = get_global_size(0);
    ulong per_item = (n + work_items - 1) / work_items;   // ceil(n / work_items)
    ulong first = get_global_id(0) * per_item;
    ulong last = first + per_item;
    if (last > n)
    {
        last = n;
    }

    ulong idx = first;
    while (idx < last)
    {
        prm_1[idx] = device( prm_2[idx], prm_3 );
        ++idx;
    }
}
*** No errors detected
<end of output>
Test time = 0.30 sec
----------------------------------------------------------
Test Passed.
"scan" end time: Jan 30 13:43 IST
"scan" time elapsed: 00:00:00
----------------------------------------------------------
23/30 Testing: scan_by_key
23/30 Test: scan_by_key
Command: "/tmp/vexcl/build/tests/scan_by_key"
Directory: /tmp/vexcl/build/tests
"scan_by_key" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605613
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Per-group keyed reduction: loads up to two consecutive (key, value) pairs
// starting at pos = block * 2 + l_id into local memory, folds the first value
// into the second when comp() reports equal keys, and writes one key plus two
// values per group.
//   n      - upper bound for input indices; loads past it are skipped
//   ivals  - input values
//   ovals1 - receives shared.vals[1] for each group
//   ovals2 - receives shared.vals[0] for each group
//   ikeys0 - input keys
//   okeys0 - receives shared.keys0[1] for each group
// NOTE(review): the array sizes (2) and the loop start (1) look generated for
// a work-group size of 1 - confirm against the host code.
kernel void block_scan_by_key
(
ulong n,
global const int * ivals,
global int * ovals1,
global int * ovals2,
global const int * ikeys0,
global int * okeys0
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
size_t pos = block * 2 + l_id;
struct Shared
{
int vals[2];
int keys0[2];
};
local struct Shared shared;
// NOTE(review): slots whose bound check fails stay unwritten but are still
// read by the loop and the final stores below.
if (pos < n)
{
shared.vals[l_id] = ivals[pos];
shared.keys0[l_id] = ikeys0[pos];
}
if (pos + 1 < n)
{
shared.vals[l_id + 1] = ivals[pos + 1];
shared.keys0[l_id + 1] = ikeys0[pos + 1];
}
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 compares slots 0 and 1 and combines their values on a match.
for(size_t start = 1; start > 0; start /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int key10 = shared.keys0[temp1];
int key20 = shared.keys0[temp2];
if (comp(key20, key10))
{
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]);
}
}
offset *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's key and values.
if (l_id == 0)
{
okeys0[block] = shared.keys0[1];
ovals1[block] = shared.vals[1];
ovals2[block] = shared.vals[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Keyed scan over the per-group partial results, performed in place in
// pre_sum. Each work-item owns work_per_thread consecutive entries starting
// at map_id = g_id * work_per_thread.
//   n               - upper bound for indices into pre_sum / key_sum0
//   pre_sum         - partial values; overwritten with running keyed totals
//   work_per_thread - number of entries handled by each work-item
//   key_sum0        - key belonging to each pre_sum entry
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_inclusive_scan_by_key
(
ulong n,
global int * pre_sum,
uint work_per_thread,
global const int * key_sum0
)
{
size_t block = get_group_id(0);
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t map_id = g_id * work_per_thread;
struct Shared
{
int vals[1];
int keys0[1];
};
local struct Shared shared;
uint offset;
int key0;
int work_sum;
// Phase 1: sequential keyed scan of this work-item's own entries; the running
// total restarts whenever the key differs from the previous entry's key.
// NOTE(review): key0 and work_sum stay unassigned when map_id >= n but are
// still stored into shared below.
if (map_id < n)
{
int prev_key0;
offset = 0;
key0 = key_sum0[map_id];
work_sum = pre_sum[map_id];
for(offset = 1; offset < work_per_thread; ++offset)
{
prev_key0 = key0;
// NOTE(review): this key read happens before the map_id + offset < n check.
key0 = key_sum0[map_id + offset];
if (map_id + offset < n)
{
if (comp(key0, prev_key0)) work_sum = oper(work_sum, pre_sum[map_id + offset]);
else work_sum = pre_sum[map_id + offset];
pre_sum[map_id + offset] = work_sum;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int scan_sum = work_sum;
shared.vals[l_id] = work_sum;
shared.keys0[l_id] = key0;
// Phase 2: keyed scan across work-items of the group. With the bound
// "offset < 1" the condition is false on entry, so the body never executes.
for(offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n)
{
if (l_id >= offset)
{
int key10 = shared.keys0[l_id];
int key20 = shared.keys0[l_id - offset];
if (comp(key10, key20)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]);
else scan_sum = shared.vals[l_id];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: every work-item except the first in its group adds the previous
// work-item's total to each owned entry whose key equals the key that
// previous work-item stored.
// NOTE(review): map_id + offset is not checked against n in this loop.
for(offset = 0; offset < work_per_thread; ++offset)
{
barrier(CLK_GLOBAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = pre_sum[map_id + offset];
int key10 = key_sum0[map_id + offset];
int key20 = shared.keys0[l_id - 1];
if (comp(key10, key20)) y = oper(y, shared.vals[l_id - 1]);
pre_sum[map_id + offset] = y;
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Final pass of the keyed scan: each work-item takes its own input value,
// work-item 0 of every group but the first adds the total carried in from the
// preceding groups when the keys match, and the result is written to ovals.
//   n        - number of elements; work-items with g_id >= n write nothing
//   pre_sum  - carried totals, read at index block / 2 - 1
//   pre_sum1 - carried values, read at index block / 2 (or 0 for block 1)
//   ivals    - input values
//   ovals    - output values, one entry per g_id < n
//   ikeys0   - input keys
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_add_by_key
(
ulong n,
global const int * pre_sum,
global const int * pre_sum1,
global const int * ivals,
global int * ovals,
global const int * ikeys0
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
struct Shared
{
int vals[1];
int keys0[1];
};
local struct Shared shared;
int val;
int key0;
if (g_id < n)
{
shared.vals[l_id] =val = ivals[g_id];
shared.keys0[l_id] = key0 = ikeys0[g_id];
}
// NOTE(review): for g_id >= n the shared slot read here was never written;
// the value is only consumed under g_id < n below.
int scan_result = shared.vals[l_id];
int post_sum, new_result, sum;
int key10, key20, key30, key40;
if (l_id == 0 && g_id < n)
{
if (block > 0)
{
// key10 is this element's key, key20 the key at index block - 1.
key10 = ikeys0[g_id];
key20 = ikeys0[block * 1 - 1];
// The carry comes from pre_sum for even blocks, from pre_sum1[0] for block 1,
// and for the remaining odd blocks from pre_sum1 combined with pre_sum only
// when the keys at indices block - 1 and block - 2 match.
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1];
else if (block == 1) post_sum = pre_sum1[0];
else
{
key30 = ikeys0[block * 1 - 1];
key40 = ikeys0[(block - 1) * 1 - 1];
if (comp(key30, key40)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]);
else post_sum = pre_sum1[block / 2];
}
if (comp(key10, key20)) new_result = oper(scan_result, post_sum);
else new_result = scan_result;
}
else new_result = scan_result;
shared.vals[l_id] = new_result;
}
sum = shared.vals[l_id];
// Keyed scan across the work-items of the group; with the bound "offset < 1"
// the condition is false on entry, so the body never executes.
for(size_t offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset)
{
key20 = shared.keys0[l_id - offset];
if (comp(key0, key20)) sum = oper(sum, shared.vals[l_id - offset]);
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n) ovals[g_id] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Per-group keyed reduction with an initial value: loads up to two
// consecutive (key, value) pairs starting at pos = block * 2 + l_id into
// local memory. A value whose key differs from the preceding element's key is
// first combined with `init`. The first value is then folded into the second
// when comp() reports equal keys, and one key plus two values are written per
// group.
//   n      - upper bound for input indices
//   ivals  - input values
//   ovals1 - receives shared.vals[1] for each group
//   ovals2 - receives shared.vals[0] for each group
//   ikeys0 - input keys
//   okeys0 - receives shared.keys0[1] for each group
//   init   - value combined into the first element of every key run
// NOTE(review): the array sizes (2) and the loop start (1) look generated for
// a work-group size of 1 - confirm against the host code.
kernel void block_scan_by_key
(
ulong n,
global const int * ivals,
global int * ovals1,
global int * ovals2,
global const int * ikeys0,
global int * okeys0,
int init
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
size_t pos = block * 2 + l_id;
struct Shared
{
int vals[2];
int keys0[2];
};
local struct Shared shared;
if (g_id > 0 && pos < n)
{
int key10 = ikeys0[pos];
int key20 = ikeys0[pos - 1];
if (comp(key10, key20))
{
shared.vals[l_id] = ivals[pos];
}
else
{
shared.vals[l_id] = oper(init, ivals[pos]);
}
shared.keys0[l_id] = ikeys0[pos];
}
else
{
// NOTE(review): besides g_id == 0 this branch is also taken when pos >= n,
// in which case the slot is filled from element 0.
shared.vals[l_id] = oper(init, ivals[0]);
shared.keys0[l_id] = ikeys0[0];
}
// NOTE(review): if pos + 1 >= n the second slot stays unwritten but is still
// read by the loop and the final stores below.
if (pos + 1 < n)
{
int key10 = ikeys0[pos + 1];
int key20 = ikeys0[pos + 1 - 1];
if (comp(key10, key20))
{
shared.vals[l_id + 1] = ivals[pos + 1];
}
else
{
shared.vals[l_id + 1] = oper(init, ivals[pos + 1]);
}
shared.keys0[l_id + 1] = ikeys0[pos + 1];
}
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 compares slots 0 and 1 and combines their values on a match.
for(size_t start = 1; start > 0; start /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int key10 = shared.keys0[temp1];
int key20 = shared.keys0[temp2];
if (comp(key20, key10))
{
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]);
}
}
offset *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's key and values.
if (l_id == 0)
{
okeys0[block] = shared.keys0[1];
ovals1[block] = shared.vals[1];
ovals2[block] = shared.vals[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Keyed scan over the per-group partial results, performed in place in
// pre_sum. Each work-item owns work_per_thread consecutive entries starting
// at map_id = g_id * work_per_thread.
//   n               - upper bound for indices into pre_sum / key_sum0
//   pre_sum         - partial values; overwritten with running keyed totals
//   work_per_thread - number of entries handled by each work-item
//   key_sum0        - key belonging to each pre_sum entry
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_inclusive_scan_by_key
(
ulong n,
global int * pre_sum,
uint work_per_thread,
global const int * key_sum0
)
{
size_t block = get_group_id(0);
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t map_id = g_id * work_per_thread;
struct Shared
{
int vals[1];
int keys0[1];
};
local struct Shared shared;
uint offset;
int key0;
int work_sum;
// Phase 1: sequential keyed scan of this work-item's own entries; the running
// total restarts whenever the key differs from the previous entry's key.
// NOTE(review): key0 and work_sum stay unassigned when map_id >= n but are
// still stored into shared below.
if (map_id < n)
{
int prev_key0;
offset = 0;
key0 = key_sum0[map_id];
work_sum = pre_sum[map_id];
for(offset = 1; offset < work_per_thread; ++offset)
{
prev_key0 = key0;
// NOTE(review): this key read happens before the map_id + offset < n check.
key0 = key_sum0[map_id + offset];
if (map_id + offset < n)
{
if (comp(key0, prev_key0)) work_sum = oper(work_sum, pre_sum[map_id + offset]);
else work_sum = pre_sum[map_id + offset];
pre_sum[map_id + offset] = work_sum;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int scan_sum = work_sum;
shared.vals[l_id] = work_sum;
shared.keys0[l_id] = key0;
// Phase 2: keyed scan across work-items of the group. With the bound
// "offset < 1" the condition is false on entry, so the body never executes.
for(offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n)
{
if (l_id >= offset)
{
int key10 = shared.keys0[l_id];
int key20 = shared.keys0[l_id - offset];
if (comp(key10, key20)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]);
else scan_sum = shared.vals[l_id];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: every work-item except the first in its group adds the previous
// work-item's total to each owned entry whose key equals the key that
// previous work-item stored.
// NOTE(review): map_id + offset is not checked against n in this loop.
for(offset = 0; offset < work_per_thread; ++offset)
{
barrier(CLK_GLOBAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = pre_sum[map_id + offset];
int key10 = key_sum0[map_id + offset];
int key20 = shared.keys0[l_id - 1];
if (comp(key10, key20)) y = oper(y, shared.vals[l_id - 1]);
pre_sum[map_id + offset] = y;
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the by-key scan: true when the two keys are equal.
bool comp
(
    int x,
    int y
)
{
    bool same_key = (x == y);
    return same_key;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Final pass of the keyed scan with an initial value: each work-item starts
// from the preceding element's value when that element has the same key, and
// from `init` otherwise. Work-item 0 of every group but the first then
// replaces that with the carried total (same key as index block - 1) or with
// `init`, and the result is written to ovals.
//   n        - number of elements; work-items with g_id >= n write nothing
//   pre_sum  - carried totals, read at index block / 2 - 1
//   pre_sum1 - carried values, read at index block / 2 (or 0 for block 1)
//   ivals    - input values
//   ovals    - output values, one entry per g_id < n
//   ikeys0   - input keys
//   init     - value used where no preceding element shares the key
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_add_by_key
(
ulong n,
global const int * pre_sum,
global const int * pre_sum1,
global const int * ivals,
global int * ovals,
global const int * ikeys0,
int init
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
struct Shared
{
int vals[1];
int keys0[1];
};
local struct Shared shared;
int val;
int key0;
if (g_id < n)
{
if (g_id > 0)
{
int key10 = key0 = ikeys0[g_id];
int key20 = ikeys0[g_id-1];
if (comp(key10, key20)) val = ivals[g_id - 1];
else val = init;
shared.vals[l_id] = val;
shared.keys0[l_id] = key0;
}
else
{
// NOTE(review): key0 is not assigned on this path; it is only read inside
// the "offset < 1" loop below, whose body never runs.
val = init;
shared.vals[l_id] = val;
shared.keys0[l_id] = ikeys0[g_id];
}
}
// NOTE(review): for g_id >= n the shared slot read here was never written;
// the value is only consumed under g_id < n below.
int scan_result = shared.vals[l_id];
int post_sum, new_result, sum;
int key10, key20, key30, key40;
if (l_id == 0 && g_id < n)
{
if (block > 0)
{
// key10 is this element's key, key20 the key at index block - 1.
key10 = ikeys0[g_id];
key20 = ikeys0[block * 1 - 1];
// The carry comes from pre_sum for even blocks, from pre_sum1[0] for block 1,
// and for the remaining odd blocks from pre_sum1 combined with pre_sum only
// when the keys at indices block - 1 and block - 2 match.
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1];
else if (block == 1) post_sum = pre_sum1[0];
else
{
key30 = ikeys0[block * 1 - 1];
key40 = ikeys0[(block - 1) * 1 - 1];
if (comp(key30, key40)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]);
else post_sum = pre_sum1[block / 2];
}
if (comp(key10, key20)) new_result = post_sum;
else new_result = init;
}
else new_result = scan_result;
shared.vals[l_id] = new_result;
}
sum = shared.vals[l_id];
// Keyed scan across the work-items of the group; with the bound "offset < 1"
// the condition is false on entry, so the body never executes.
for(size_t offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset)
{
key20 = shared.keys0[l_id - offset];
if (comp(key0, key20)) sum = oper(sum, shared.vals[l_id - offset]);
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n) ovals[g_id] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the two-component by-key scan: true when both components
// of key a (a1, a2) equal the matching components of key b (b1, b2).
bool comp
(
    int a1,
    int a2,
    int b1,
    int b2
)
{
    if (a1 != b1)
    {
        return false;
    }
    return a2 == b2;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Per-group keyed reduction for two-component keys: loads up to two
// consecutive (key0, key1, value) triples starting at pos = block * 2 + l_id
// into local memory, folds the first value into the second when comp()
// reports equal keys, and writes one key pair plus two values per group.
//   n              - upper bound for input indices; loads past it are skipped
//   ivals          - input values
//   ovals1         - receives shared.vals[1] for each group
//   ovals2         - receives shared.vals[0] for each group
//   ikeys0, ikeys1 - input key components
//   okeys0, okeys1 - receive shared.keys0[1] / shared.keys1[1] for each group
// NOTE(review): the array sizes (2) and the loop start (1) look generated for
// a work-group size of 1 - confirm against the host code.
kernel void block_scan_by_key
(
ulong n,
global const int * ivals,
global int * ovals1,
global int * ovals2,
global const int * ikeys0,
global const int * ikeys1,
global int * okeys0,
global int * okeys1
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
size_t pos = block * 2 + l_id;
struct Shared
{
int vals[2];
int keys0[2];
int keys1[2];
};
local struct Shared shared;
// NOTE(review): slots whose bound check fails stay unwritten but are still
// read by the loop and the final stores below.
if (pos < n)
{
shared.vals[l_id] = ivals[pos];
shared.keys0[l_id] = ikeys0[pos];
shared.keys1[l_id] = ikeys1[pos];
}
if (pos + 1 < n)
{
shared.vals[l_id + 1] = ivals[pos + 1];
shared.keys0[l_id + 1] = ikeys0[pos + 1];
shared.keys1[l_id + 1] = ikeys1[pos + 1];
}
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 compares slots 0 and 1 and combines their values on a match.
for(size_t start = 1; start > 0; start /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int key10 = shared.keys0[temp1];
int key20 = shared.keys0[temp2];
int key11 = shared.keys1[temp1];
int key21 = shared.keys1[temp2];
if (comp(key20, key21, key10, key11))
{
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]);
}
}
offset *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's key pair and values.
if (l_id == 0)
{
okeys0[block] = shared.keys0[1];
okeys1[block] = shared.keys1[1];
ovals1[block] = shared.vals[1];
ovals2[block] = shared.vals[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the two-component by-key scan: true when both components
// of key a (a1, a2) equal the matching components of key b (b1, b2).
bool comp
(
    int a1,
    int a2,
    int b1,
    int b2
)
{
    if (a1 != b1)
    {
        return false;
    }
    return a2 == b2;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Keyed scan over the per-group partial results for two-component keys,
// performed in place in pre_sum. Each work-item owns work_per_thread
// consecutive entries starting at map_id = g_id * work_per_thread.
//   n                  - upper bound for indices into pre_sum / key_sum*
//   pre_sum            - partial values; overwritten with running keyed totals
//   work_per_thread    - number of entries handled by each work-item
//   key_sum0, key_sum1 - key components belonging to each pre_sum entry
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_inclusive_scan_by_key
(
ulong n,
global int * pre_sum,
uint work_per_thread,
global const int * key_sum0,
global const int * key_sum1
)
{
size_t block = get_group_id(0);
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t map_id = g_id * work_per_thread;
struct Shared
{
int vals[1];
int keys0[1];
int keys1[1];
};
local struct Shared shared;
uint offset;
int key0;
int key1;
int work_sum;
// Phase 1: sequential keyed scan of this work-item's own entries; the running
// total restarts whenever the key differs from the previous entry's key.
// NOTE(review): key0, key1 and work_sum stay unassigned when map_id >= n but
// are still stored into shared below.
if (map_id < n)
{
int prev_key0;
int prev_key1;
offset = 0;
key0 = key_sum0[map_id];
key1 = key_sum1[map_id];
work_sum = pre_sum[map_id];
for(offset = 1; offset < work_per_thread; ++offset)
{
// NOTE(review): the key reads below happen before the map_id + offset < n
// check.
prev_key0 = key0;
key0 = key_sum0[map_id + offset];
prev_key1 = key1;
key1 = key_sum1[map_id + offset];
if (map_id + offset < n)
{
if (comp(key0, key1, prev_key0, prev_key1)) work_sum = oper(work_sum, pre_sum[map_id + offset]);
else work_sum = pre_sum[map_id + offset];
pre_sum[map_id + offset] = work_sum;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int scan_sum = work_sum;
shared.vals[l_id] = work_sum;
shared.keys0[l_id] = key0;
shared.keys1[l_id] = key1;
// Phase 2: keyed scan across work-items of the group. With the bound
// "offset < 1" the condition is false on entry, so the body never executes.
for(offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n)
{
if (l_id >= offset)
{
int key10 = shared.keys0[l_id];
int key20 = shared.keys0[l_id - offset];
int key11 = shared.keys1[l_id];
int key21 = shared.keys1[l_id - offset];
if (comp(key10, key11, key20, key21)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]);
else scan_sum = shared.vals[l_id];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Phase 3: every work-item except the first in its group adds the previous
// work-item's total to each owned entry whose key equals the key that
// previous work-item stored.
// NOTE(review): map_id + offset is not checked against n in this loop.
for(offset = 0; offset < work_per_thread; ++offset)
{
barrier(CLK_GLOBAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = pre_sum[map_id + offset];
int key10 = key_sum0[map_id + offset];
int key20 = shared.keys0[l_id - 1];
int key11 = key_sum1[map_id + offset];
int key21 = shared.keys1[l_id - 1];
if (comp(key10, key11, key20, key21)) y = oper(y, shared.vals[l_id - 1]);
pre_sum[map_id + offset] = y;
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the two-component by-key scan: true when both components
// of key a (a1, a2) equal the matching components of key b (b1, b2).
bool comp
(
    int a1,
    int a2,
    int b1,
    int b2
)
{
    if (a1 != b1)
    {
        return false;
    }
    return a2 == b2;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Final pass of the keyed scan for two-component keys: each work-item takes
// its own input value, work-item 0 of every group but the first adds the
// total carried in from the preceding groups when the keys match, and the
// result is written to ovals.
//   n              - number of elements; work-items with g_id >= n write nothing
//   pre_sum        - carried totals, read at index block / 2 - 1
//   pre_sum1       - carried values, read at index block / 2 (or 0 for block 1)
//   ivals          - input values
//   ovals          - output values, one entry per g_id < n
//   ikeys0, ikeys1 - input key components
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_add_by_key
(
ulong n,
global const int * pre_sum,
global const int * pre_sum1,
global const int * ivals,
global int * ovals,
global const int * ikeys0,
global const int * ikeys1
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
struct Shared
{
int vals[1];
int keys0[1];
int keys1[1];
};
local struct Shared shared;
int val;
int key0;
int key1;
if (g_id < n)
{
shared.vals[l_id] =val = ivals[g_id];
shared.keys0[l_id] = key0 = ikeys0[g_id];
shared.keys1[l_id] = key1 = ikeys1[g_id];
}
// NOTE(review): for g_id >= n the shared slot read here was never written;
// the value is only consumed under g_id < n below.
int scan_result = shared.vals[l_id];
int post_sum, new_result, sum;
int key10, key20, key30, key40;
int key11, key21, key31, key41;
if (l_id == 0 && g_id < n)
{
if (block > 0)
{
// (key10, key11) is this element's key, (key20, key21) the key at index
// block - 1.
key10 = ikeys0[g_id];
key20 = ikeys0[block * 1 - 1];
key11 = ikeys1[g_id];
key21 = ikeys1[block * 1 - 1];
// The carry comes from pre_sum for even blocks, from pre_sum1[0] for block 1,
// and for the remaining odd blocks from pre_sum1 combined with pre_sum only
// when the keys at indices block - 1 and block - 2 match.
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1];
else if (block == 1) post_sum = pre_sum1[0];
else
{
key30 = ikeys0[block * 1 - 1];
key40 = ikeys0[(block - 1) * 1 - 1];
key31 = ikeys1[block * 1 - 1];
key41 = ikeys1[(block - 1) * 1 - 1];
if (comp(key30, key31, key40, key41)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]);
else post_sum = pre_sum1[block / 2];
}
if (comp(key10, key11, key20, key21)) new_result = oper(scan_result, post_sum);
else new_result = scan_result;
}
else new_result = scan_result;
shared.vals[l_id] = new_result;
}
sum = shared.vals[l_id];
// Keyed scan across the work-items of the group; with the bound "offset < 1"
// the condition is false on entry, so the body never executes.
for(size_t offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset)
{
key20 = shared.keys0[l_id - offset];
key21 = shared.keys1[l_id - offset];
if (comp(key0, key1, key20, key21)) sum = oper(sum, shared.vals[l_id - offset]);
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n) ovals[g_id] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the two-component by-key scan: true when both components
// of key a (a1, a2) equal the matching components of key b (b1, b2).
bool comp
(
    int a1,
    int a2,
    int b1,
    int b2
)
{
    if (a1 != b1)
    {
        return false;
    }
    return a2 == b2;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Per-group keyed reduction with an initial value, for two-component keys:
// loads up to two consecutive (key0, key1, value) triples starting at
// pos = block * 2 + l_id into local memory. A value whose key differs from
// the preceding element's key is first combined with `init`. The first value
// is then folded into the second when comp() reports equal keys, and one key
// pair plus two values are written per group.
//   n              - upper bound for input indices
//   ivals          - input values
//   ovals1         - receives shared.vals[1] for each group
//   ovals2         - receives shared.vals[0] for each group
//   ikeys0, ikeys1 - input key components
//   okeys0, okeys1 - receive shared.keys0[1] / shared.keys1[1] for each group
//   init           - value combined into the first element of every key run
// NOTE(review): the array sizes (2) and the loop start (1) look generated for
// a work-group size of 1 - confirm against the host code.
kernel void block_scan_by_key
(
ulong n,
global const int * ivals,
global int * ovals1,
global int * ovals2,
global const int * ikeys0,
global const int * ikeys1,
global int * okeys0,
global int * okeys1,
int init
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
size_t pos = block * 2 + l_id;
struct Shared
{
int vals[2];
int keys0[2];
int keys1[2];
};
local struct Shared shared;
if (g_id > 0 && pos < n)
{
int key10 = ikeys0[pos];
int key20 = ikeys0[pos - 1];
int key11 = ikeys1[pos];
int key21 = ikeys1[pos - 1];
if (comp(key10, key11, key20, key21))
{
shared.vals[l_id] = ivals[pos];
}
else
{
shared.vals[l_id] = oper(init, ivals[pos]);
}
shared.keys0[l_id] = ikeys0[pos];
shared.keys1[l_id] = ikeys1[pos];
}
else
{
// NOTE(review): besides g_id == 0 this branch is also taken when pos >= n,
// in which case the slot is filled from element 0.
shared.vals[l_id] = oper(init, ivals[0]);
shared.keys0[l_id] = ikeys0[0];
shared.keys1[l_id] = ikeys1[0];
}
// NOTE(review): if pos + 1 >= n the second slot stays unwritten but is still
// read by the loop and the final stores below.
if (pos + 1 < n)
{
int key10 = ikeys0[pos + 1];
int key20 = ikeys0[pos + 1 - 1];
int key11 = ikeys1[pos + 1];
int key21 = ikeys1[pos + 1 - 1];
if (comp(key10, key11, key20, key21))
{
shared.vals[l_id + 1] = ivals[pos + 1];
}
else
{
shared.vals[l_id + 1] = oper(init, ivals[pos + 1]);
}
shared.keys0[l_id + 1] = ikeys0[pos + 1];
shared.keys1[l_id + 1] = ikeys1[pos + 1];
}
// start begins at 1 and is halved each pass, so the body runs exactly once:
// work-item 0 compares slots 0 and 1 and combines their values on a match.
for(size_t start = 1; start > 0; start /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int key10 = shared.keys0[temp1];
int key20 = shared.keys0[temp2];
int key11 = shared.keys1[temp1];
int key21 = shared.keys1[temp2];
if (comp(key20, key21, key10, key11))
{
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]);
}
}
offset *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 publishes the group's key pair and values.
if (l_id == 0)
{
okeys0[block] = shared.keys0[1];
okeys1[block] = shared.keys1[1];
ovals1[block] = shared.vals[1];
ovals2[block] = shared.vals[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
// Key predicate for the two-component by-key scan: true when both components
// of key a (a1, a2) equal the matching components of key b (b1, b2).
bool comp
(
    int a1,
    int a2,
    int b1,
    int b2
)
{
    if (a1 != b1)
    {
        return false;
    }
    return a2 == b2;
}
// Binary operator used by the by-key scan kernels: integer addition.
int oper
(
    int x,
    int y
)
{
    int sum = x + y;
    return sum;
}
// Final pass of the keyed scan with an initial value, for two-component keys:
// each work-item starts from the preceding element's value when that element
// has the same key, and from `init` otherwise. Work-item 0 of every group but
// the first then replaces that with the carried total (same key as index
// block - 1) or with `init`, and the result is written to ovals.
//   n              - number of elements; work-items with g_id >= n write nothing
//   pre_sum        - carried totals, read at index block / 2 - 1
//   pre_sum1       - carried values, read at index block / 2 (or 0 for block 1)
//   ivals          - input values
//   ovals          - output values, one entry per g_id < n
//   ikeys0, ikeys1 - input key components
//   init           - value used where no preceding element shares the key
// NOTE(review): the array sizes (1) and the "offset < 1" loop bound look
// generated for a work-group size of 1 - confirm against the host code.
kernel void block_add_by_key
(
ulong n,
global const int * pre_sum,
global const int * pre_sum1,
global const int * ivals,
global int * ovals,
global const int * ikeys0,
global const int * ikeys1,
int init
)
{
size_t g_id = get_global_id(0);
size_t l_id = get_local_id(0);
size_t block = get_group_id(0);
struct Shared
{
int vals[1];
int keys0[1];
int keys1[1];
};
local struct Shared shared;
int val;
int key0;
int key1;
if (g_id < n)
{
if (g_id > 0)
{
int key10 = key0 = ikeys0[g_id];
int key20 = ikeys0[g_id-1];
int key11 = key1 = ikeys1[g_id];
int key21 = ikeys1[g_id-1];
if (comp(key10, key11, key20, key21)) val = ivals[g_id - 1];
else val = init;
shared.vals[l_id] = val;
shared.keys0[l_id] = key0;
shared.keys1[l_id] = key1;
}
else
{
// NOTE(review): key0 and key1 are not assigned on this path; they are only
// read inside the "offset < 1" loop below, whose body never runs.
val = init;
shared.vals[l_id] = val;
shared.keys0[l_id] = ikeys0[g_id];
shared.keys1[l_id] = ikeys1[g_id];
}
}
// NOTE(review): for g_id >= n the shared slot read here was never written;
// the value is only consumed under g_id < n below.
int scan_result = shared.vals[l_id];
int post_sum, new_result, sum;
int key10, key20, key30, key40;
int key11, key21, key31, key41;
if (l_id == 0 && g_id < n)
{
if (block > 0)
{
// (key10, key11) is this element's key, (key20, key21) the key at index
// block - 1.
key10 = ikeys0[g_id];
key20 = ikeys0[block * 1 - 1];
key11 = ikeys1[g_id];
key21 = ikeys1[block * 1 - 1];
// The carry comes from pre_sum for even blocks, from pre_sum1[0] for block 1,
// and for the remaining odd blocks from pre_sum1 combined with pre_sum only
// when the keys at indices block - 1 and block - 2 match.
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1];
else if (block == 1) post_sum = pre_sum1[0];
else
{
key30 = ikeys0[block * 1 - 1];
key40 = ikeys0[(block - 1) * 1 - 1];
key31 = ikeys1[block * 1 - 1];
key41 = ikeys1[(block - 1) * 1 - 1];
if (comp(key30, key31, key40, key41)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]);
else post_sum = pre_sum1[block / 2];
}
if (comp(key10, key11, key20, key21)) new_result = post_sum;
else new_result = init;
}
else new_result = scan_result;
shared.vals[l_id] = new_result;
}
sum = shared.vals[l_id];
// Keyed scan across the work-items of the group; with the bound "offset < 1"
// the condition is false on entry, so the body never executes.
for(size_t offset = 1; offset < 1; offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id >= offset)
{
key20 = shared.keys0[l_id - offset];
key21 = shared.keys1[l_id - offset];
if (comp(key0, key1, key20, key21)) sum = oper(sum, shared.vals[l_id - offset]);
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[l_id] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (g_id < n) ovals[g_id] = sum;
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"scan_by_key" end time: Jan 30 13:43 IST
"scan_by_key" time elapsed: 00:00:00
----------------------------------------------------------
24/30 Testing: reduce_by_key
24/30 Test: reduce_by_key
Command: "/tmp/vexcl/build/tests/reduce_by_key"
Directory: /tmp/vexcl/build/tests
"reduce_by_key" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605614
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Key-equality predicate: yields true when the two integer keys match. */
bool comp
(
    int x,
    int y
)
{
    return !(x != y);
}
/*
 * offset_calculation: marks where a new run of keys begins.
 * offsets[0] is set to 0; for i > 0, offsets[i] is 1 when comp() reports
 * that keys0[i - 1] and keys0[i] differ, and 0 otherwise.  Every work-item
 * processes one contiguous slice of the n elements.
 */
kernel void offset_calculation
(
    ulong n,
    global const int * keys0,
    global int * offsets
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        offsets[i] = (i == 0) ? 0 : !comp(keys0[i - 1], keys0[i]);
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * block_inclusive_scan: per work-group, loads up to two consecutive input
 * values into local memory, combines them with oper(), and writes both the
 * combined value and the first value out, indexed by group id.
 *
 *   n          element count used for the bounds checks on `input`
 *   input      values to be combined
 *   identity   combined with input[0] by global work-item 0 when
 *              `exclusive` is non-zero
 *   scan_buf1  scan_buf1[group] receives shared[1]
 *   scan_buf2  scan_buf2[group] receives shared[0]
 *   exclusive  non-zero enables the identity pre-combination
 *
 * NOTE(review): the literal sizes (shared[2], block * 2, start = 1) look
 * generated for a work-group size of 1 -- confirm against the host code.
 */
kernel void block_inclusive_scan
(
ulong n,
global const int * input,
int identity,
global int * scan_buf1,
global int * scan_buf2,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
local int shared[2];
/* Stage this group's pair of elements; a slot whose index is >= n is not written. */
/* NOTE(review): an unwritten slot is still read by the loop below -- confirm this is harmless. */
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id];
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1];
/* The very first work-item replaces its slot with oper(identity, input[0]). */
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]);
/* With start == 1 this loop runs once: work-item 0 stores
   oper(shared[1], shared[0]) into shared[1]. */
for (size_t start = 1; start > 0; start >>= 1, offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int y2 = shared[temp2];
int y1 = shared[temp1];
shared[temp2] = oper(y2, y1);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* One work-item per group publishes the combined value and the first value. */
if (l_id == 0)
{
scan_buf1[ block ] = shared[1];
scan_buf2[ block ] = shared[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * intra_block_inclusive_scan: running oper() combination over pre_sum,
 * written to post_sum.  Work-item g_id owns the work_per_thread entries
 * starting at map_id = g_id * work_per_thread.
 *
 *   n                element count used for the bounds checks
 *   post_sum         output; receives the running combinations
 *   pre_sum          input values
 *   identity         not referenced in this kernel body
 *   work_per_thread  number of consecutive entries handled per work-item
 *
 * NOTE(review): shared[1] and the `offset < 1` loop bound look generated
 * for a work-group size of 1 -- confirm against the host code.
 */
kernel void intra_block_inclusive_scan
(
ulong n,
global int * post_sum,
global const int * pre_sum,
int identity,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
local int shared[1];
size_t offset;
int work_sum;
/* Pass 1: fold this work-item's entries into work_sum; entries at or past n are skipped. */
if (map_id < n)
{
offset = 0;
work_sum = pre_sum[map_id];
for( offset = 1; offset < work_per_thread; ++offset )
{
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] );
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* NOTE(review): work_sum has not been assigned at this point when map_id >= n. */
int scan_sum = work_sum;
shared[ l_id ] = work_sum;
/* Cross-work-item scan over shared[]; with the bound `offset < 1` the body never executes. */
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Pass 2: restart from the first owned entry, prepend the previous
   work-item's total when l_id > 0, and write the running results. */
/* NOTE(review): this read and the post_sum/pre_sum accesses below are not
   guarded by map_id < n or map_id + offset < n in every branch -- verify
   that the buffers are large enough for every launched work-item. */
work_sum = pre_sum[map_id];
if (l_id > 0)
{
work_sum = oper(work_sum, shared[l_id - 1]);
post_sum[map_id] = work_sum;
}
else post_sum[map_id] = work_sum;
for( offset = 1; offset < work_per_thread; ++offset )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = oper(pre_sum[map_id + offset], work_sum);
post_sum[ map_id + offset ] = y;
work_sum = y;
}
else
{
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);
work_sum = post_sum[map_id + offset];
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * block_addition: final pass of the scan.  Each work-item loads its input
 * value, the first work-item of every group after group 0 folds in the
 * total carried over from the preceding groups, and the result is written
 * to output.
 *
 *   n          number of valid elements
 *   input      original values
 *   output     receives the scanned values (only indices < n are written)
 *   post_sum   scanned pair totals; entry block/2 - 1 is the carry for
 *              even-numbered groups
 *   pre_sum    unscanned per-group values; used for odd-numbered groups
 *   identity   value used for element 0 of an exclusive scan
 *   exclusive  non-zero selects the exclusive variant (value shifted by one)
 *
 * NOTE(review): shared[1] and the `offset < 1` loop bound look generated
 * for a work-group size of 1 -- confirm against the host code.
 */
kernel void block_addition
(
    ulong n,
    global const int * input,
    global int * output,
    global int * post_sum,
    global int * pre_sum,
    int identity,
    int exclusive
)
{
    size_t l_id = get_local_id(0);
    size_t g_id = get_global_id(0);
    size_t block = get_group_id(0);
    local int shared[1];

    /* Start from the identity so that work-items past the end of the input
       stage a defined value instead of reading an uninitialised variable. */
    int val = identity;
    if (g_id < n)
    {
        if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;
        else val = input[g_id];
    }
    shared[l_id] = val;
    int scan_result = val;
    int post_block_sum, new_result;
    int y1, y2, sum;

    /* The group's first work-item merges in the carry from earlier groups. */
    if (l_id == 0 && g_id < n)
    {
        if (block > 0)
        {
            if (block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ];
            else if (block == 1) post_block_sum = pre_sum[0];
            else
            {
                y1 = post_sum[ block/2 - 1 ];
                y2 = pre_sum [ block/2];
                post_block_sum = oper(y1, y2);
            }
            new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum );
        }
        else new_result = scan_result;
        shared[ l_id ] = new_result;
    }
    sum = shared[ l_id ];

    /* Scan inside the group; with the bound `offset < 1` the body never executes. */
    for( size_t offset = 1; offset < 1; offset *= 2 )
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );
        barrier(CLK_LOCAL_MEM_FENCE);
        shared[ l_id ] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (g_id < n) output[ g_id ] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_scan_by_key: segmented scan inside one work-group.  Values that
 * share a key with their predecessors in the group are combined with
 * oper(); the result is written to output only at the last element of a
 * run of equal keys.  The group's first key and value are also published.
 *
 *   n        number of valid elements
 *   keys     key of every element
 *   vals     value of every element
 *   output   receives the combined value at run ends (other slots untouched)
 *   key_buf  key_buf[group] receives shared.keys[0]
 *   val_buf  val_buf[group] receives shared.vals[0]
 *
 * NOTE(review): the one-element Shared arrays and the `offset < 1` loop
 * bound look generated for a work-group size of 1 -- confirm against the
 * host code.
 */
kernel void block_scan_by_key
(
    ulong n,
    global const int * keys,
    global const double * vals,
    global double * output,
    global int * key_buf,
    global double * val_buf
)
{
    size_t l_id = get_local_id(0);
    size_t g_id = get_global_id(0);
    size_t block = get_group_id(0);
    struct Shared
    {
        int keys[1];
        double vals[1];
    };
    local struct Shared shared;

    /* Defined defaults: work-items past the end of the input still reach the
       barriers below and must not read uninitialised locals. */
    int key = 0;
    double val = 0.0;
    if (g_id < n)
    {
        key = keys[g_id];
        val = vals[g_id];
        shared.keys[l_id] = key;
        shared.vals[l_id] = val;
    }
    double sum = val;

    /* Scan inside the group; with the bound `offset < 1` the body never executes. */
    for(size_t offset = 1; offset < 1; offset *= 2)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (l_id >= offset && shared.keys[l_id - offset] == key)
        {
            sum = oper(sum, shared.vals[l_id - offset]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        shared.vals[l_id] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (g_id >= n) return;

    /* A run ends at the last element or where the next key differs.  The
       last element is tested by position rather than against a sentinel key,
       so a genuine key value can never be mistaken for "no next element". */
    if (g_id == n - 1 || keys[g_id + 1] != key) output[g_id] = sum;

    if (l_id == 0)
    {
        key_buf[block] = shared.keys[0];
        val_buf[block] = shared.vals[0];
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_inclusive_scan_by_key: segmented running combination over pre_sum.
 * Consecutive entries whose key_sum values are equal are combined with
 * oper(); a change of key restarts the running value.  Work-item g_id owns
 * the work_per_thread entries starting at map_id = g_id * work_per_thread.
 *
 *   n                element count used for the bounds checks
 *   key_sum          key of every entry
 *   pre_sum          input values
 *   post_sum         output; receives the running combinations
 *   work_per_thread  number of consecutive entries handled per work-item
 *
 * NOTE(review): the one-element Shared arrays and the `offset < 1` loop
 * bound look generated for a work-group size of 1 -- confirm against the
 * host code.
 */
kernel void block_inclusive_scan_by_key
(
ulong n,
global const int * key_sum,
global const double * pre_sum,
global double * post_sum,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
struct Shared
{
int keys[1];
double vals[1];
};
local struct Shared shared;
uint offset;
int key;
double work_sum;
/* Pass 1: sequential segmented scan over this work-item's own entries. */
if (map_id < n)
{
int prev_key;
offset = 0;
key = key_sum[map_id];
work_sum = pre_sum[map_id];
post_sum[map_id] = work_sum;
for( offset = offset + 1; offset < work_per_thread; ++offset )
{
prev_key = key;
/* NOTE(review): this key is read before the map_id + offset < n check
   below -- verify key_sum is large enough. */
key = key_sum[ map_id + offset ];
if ( map_id + offset < n )
{
double y = pre_sum[ map_id + offset ];
if ( key == prev_key ) work_sum = oper( work_sum, y );
else work_sum = y;
post_sum[ map_id + offset ] = work_sum;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Publish this work-item's final running value and key to local memory. */
/* NOTE(review): work_sum and key have not been assigned when map_id >= n. */
shared.vals[ l_id ] = work_sum;
shared.keys[ l_id ] = key;
double scan_sum = work_sum;
/* Cross-work-item segmented scan; with the bound `offset < 1` the body never executes. */
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n)
{
if (l_id >= offset)
{
int key1 = shared.keys[ l_id ];
int key2 = shared.keys[ l_id - offset ];
if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );
else scan_sum = shared.vals[ l_id ];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Pass 2: every work-item except the first in its group adds the previous
   work-item's value to each owned entry whose key matches that work-item's key. */
/* NOTE(review): map_id + offset is not checked against n in this loop. */
for( offset = 0; offset < work_per_thread; ++offset )
{
barrier(CLK_GLOBAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
double y = post_sum[ map_id + offset ];
int key1 = key_sum [ map_id + offset ];
int key2 = shared.keys[ l_id - 1 ];
if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );
post_sum[ map_id + offset ] = y;
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_sum_by_key: adds the carry of the previous work-group to the
 * elements that close a run of equal keys.
 *
 * An element is updated only when all of the following hold:
 *   - it belongs to a group other than group 0,
 *   - key_sum[group - 1] equals the element's own key,
 *   - it is the last element overall or the next element has another key.
 * In that case output[g_id] becomes oper(output[g_id], post_sum[group - 1]).
 */
kernel void block_sum_by_key
(
    ulong n,
    global const int * key_sum,
    global const double * post_sum,
    global const int * keys,
    global double * output
)
{
    const size_t gid = get_global_id(0);
    const size_t grp = get_group_id(0);

    /* Nothing to do outside the data or in the first group. */
    if (gid >= n || grp == 0) return;

    const int own_key = keys[gid];

    /* The carry only applies when the previous group ended on the same key. */
    if (key_sum[grp - 1] != own_key) return;

    /* Elements in the middle of a run are left alone. */
    if (gid < n - 1 && keys[gid + 1] == own_key) return;

    const double current = output[gid];
    const double carry = post_sum[grp - 1];
    output[gid] = oper(current, carry);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * key_value_mapping: compacts (key, value) pairs to one pair per run.
 * offset[i] is the output slot of element i.  The element that closes a
 * run (its successor has a different slot, or it is the last element)
 * copies its key and value into that slot.  Every work-item processes one
 * contiguous slice of the n elements.
 */
kernel void key_value_mapping
(
    ulong n,
    global const int * ikeys0,
    global int * okeys0,
    global double * ovals,
    global int * offset,
    global const double * ivals
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        const int sections = offset[n - 1] + 1;
        const int slot = offset[i];

        if (i == (n - 1))
        {
            /* The last element always closes the final run. */
            okeys0[sections - 1] = ikeys0[i];
            ovals[sections - 1] = ivals[i];
        }
        else if (slot != offset[i + 1])
        {
            okeys0[slot] = ikeys0[i];
            ovals[slot] = ivals[i];
        }
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Two-component key equality: true only when both components match. */
bool comp
(
    int a1,
    long a2,
    int b1,
    long b2
)
{
    return !((a1 != b1) || (a2 != b2));
}
/*
 * offset_calculation: marks where a new run of two-component keys begins.
 * offsets[0] is set to 0; for i > 0, offsets[i] is 1 when comp() reports
 * that element i - 1 and element i have different (keys0, keys1) pairs,
 * and 0 otherwise.  Every work-item processes one contiguous slice of the
 * n elements.
 */
kernel void offset_calculation
(
    ulong n,
    global const int * keys0,
    global const long * keys1,
    global int * offsets
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        offsets[i] = (i == 0)
            ? 0
            : !comp(keys0[i - 1], keys1[i - 1], keys0[i], keys1[i]);
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * block_inclusive_scan: per work-group, loads up to two consecutive input
 * values into local memory, combines them with oper(), and writes both the
 * combined value and the first value out, indexed by group id.
 *
 *   n          element count used for the bounds checks on `input`
 *   input      values to be combined
 *   identity   combined with input[0] by global work-item 0 when
 *              `exclusive` is non-zero
 *   scan_buf1  scan_buf1[group] receives shared[1]
 *   scan_buf2  scan_buf2[group] receives shared[0]
 *   exclusive  non-zero enables the identity pre-combination
 *
 * NOTE(review): the literal sizes (shared[2], block * 2, start = 1) look
 * generated for a work-group size of 1 -- confirm against the host code.
 */
kernel void block_inclusive_scan
(
ulong n,
global const int * input,
int identity,
global int * scan_buf1,
global int * scan_buf2,
int exclusive
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t block = get_group_id(0);
size_t offset = 1;
local int shared[2];
/* Stage this group's pair of elements; a slot whose index is >= n is not written. */
/* NOTE(review): an unwritten slot is still read by the loop below -- confirm this is harmless. */
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id];
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1];
/* The very first work-item replaces its slot with oper(identity, input[0]). */
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]);
/* With start == 1 this loop runs once: work-item 0 stores
   oper(shared[1], shared[0]) into shared[1]. */
for (size_t start = 1; start > 0; start >>= 1, offset *= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < start)
{
size_t temp1 = offset * (2 * l_id + 1) - 1;
size_t temp2 = offset * (2 * l_id + 2) - 1;
int y2 = shared[temp2];
int y1 = shared[temp1];
shared[temp2] = oper(y2, y1);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* One work-item per group publishes the combined value and the first value. */
if (l_id == 0)
{
scan_buf1[ block ] = shared[1];
scan_buf2[ block ] = shared[0];
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * intra_block_inclusive_scan: running oper() combination over pre_sum,
 * written to post_sum.  Work-item g_id owns the work_per_thread entries
 * starting at map_id = g_id * work_per_thread.
 *
 *   n                element count used for the bounds checks
 *   post_sum         output; receives the running combinations
 *   pre_sum          input values
 *   identity         not referenced in this kernel body
 *   work_per_thread  number of consecutive entries handled per work-item
 *
 * NOTE(review): shared[1] and the `offset < 1` loop bound look generated
 * for a work-group size of 1 -- confirm against the host code.
 */
kernel void intra_block_inclusive_scan
(
ulong n,
global int * post_sum,
global const int * pre_sum,
int identity,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
local int shared[1];
size_t offset;
int work_sum;
/* Pass 1: fold this work-item's entries into work_sum; entries at or past n are skipped. */
if (map_id < n)
{
offset = 0;
work_sum = pre_sum[map_id];
for( offset = 1; offset < work_per_thread; ++offset )
{
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] );
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* NOTE(review): work_sum has not been assigned at this point when map_id >= n. */
int scan_sum = work_sum;
shared[ l_id ] = work_sum;
/* Cross-work-item scan over shared[]; with the bound `offset < 1` the body never executes. */
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] );
barrier(CLK_LOCAL_MEM_FENCE);
shared[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Pass 2: restart from the first owned entry, prepend the previous
   work-item's total when l_id > 0, and write the running results. */
/* NOTE(review): this read and the post_sum/pre_sum accesses below are not
   guarded by map_id < n or map_id + offset < n in every branch -- verify
   that the buffers are large enough for every launched work-item. */
work_sum = pre_sum[map_id];
if (l_id > 0)
{
work_sum = oper(work_sum, shared[l_id - 1]);
post_sum[map_id] = work_sum;
}
else post_sum[map_id] = work_sum;
for( offset = 1; offset < work_per_thread; ++offset )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
int y = oper(pre_sum[map_id + offset], work_sum);
post_sum[ map_id + offset ] = y;
work_sum = y;
}
else
{
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);
work_sum = post_sum[map_id + offset];
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the scan kernels: integer addition. */
int oper
(
    int x,
    int y
)
{
    int combined = x;
    combined += y;
    return combined;
}
/*
 * block_addition: final pass of the scan.  Each work-item loads its input
 * value, the first work-item of every group after group 0 folds in the
 * total carried over from the preceding groups, and the result is written
 * to output.
 *
 *   n          number of valid elements
 *   input      original values
 *   output     receives the scanned values (only indices < n are written)
 *   post_sum   scanned pair totals; entry block/2 - 1 is the carry for
 *              even-numbered groups
 *   pre_sum    unscanned per-group values; used for odd-numbered groups
 *   identity   value used for element 0 of an exclusive scan
 *   exclusive  non-zero selects the exclusive variant (value shifted by one)
 *
 * NOTE(review): shared[1] and the `offset < 1` loop bound look generated
 * for a work-group size of 1 -- confirm against the host code.
 */
kernel void block_addition
(
    ulong n,
    global const int * input,
    global int * output,
    global int * post_sum,
    global int * pre_sum,
    int identity,
    int exclusive
)
{
    size_t l_id = get_local_id(0);
    size_t g_id = get_global_id(0);
    size_t block = get_group_id(0);
    local int shared[1];

    /* Start from the identity so that work-items past the end of the input
       stage a defined value instead of reading an uninitialised variable. */
    int val = identity;
    if (g_id < n)
    {
        if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;
        else val = input[g_id];
    }
    shared[l_id] = val;
    int scan_result = val;
    int post_block_sum, new_result;
    int y1, y2, sum;

    /* The group's first work-item merges in the carry from earlier groups. */
    if (l_id == 0 && g_id < n)
    {
        if (block > 0)
        {
            if (block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ];
            else if (block == 1) post_block_sum = pre_sum[0];
            else
            {
                y1 = post_sum[ block/2 - 1 ];
                y2 = pre_sum [ block/2];
                post_block_sum = oper(y1, y2);
            }
            new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum );
        }
        else new_result = scan_result;
        shared[ l_id ] = new_result;
    }
    sum = shared[ l_id ];

    /* Scan inside the group; with the bound `offset < 1` the body never executes. */
    for( size_t offset = 1; offset < 1; offset *= 2 )
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );
        barrier(CLK_LOCAL_MEM_FENCE);
        shared[ l_id ] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (g_id < n) output[ g_id ] = sum;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_scan_by_key: segmented scan inside one work-group.  Values that
 * share a key with their predecessors in the group are combined with
 * oper(); the result is written to output only at the last element of a
 * run of equal keys.  The group's first key and value are also published.
 *
 *   n        number of valid elements
 *   keys     key of every element
 *   vals     value of every element
 *   output   receives the combined value at run ends (other slots untouched)
 *   key_buf  key_buf[group] receives shared.keys[0]
 *   val_buf  val_buf[group] receives shared.vals[0]
 *
 * NOTE(review): the one-element Shared arrays and the `offset < 1` loop
 * bound look generated for a work-group size of 1 -- confirm against the
 * host code.
 */
kernel void block_scan_by_key
(
    ulong n,
    global const int * keys,
    global const double * vals,
    global double * output,
    global int * key_buf,
    global double * val_buf
)
{
    size_t l_id = get_local_id(0);
    size_t g_id = get_global_id(0);
    size_t block = get_group_id(0);
    struct Shared
    {
        int keys[1];
        double vals[1];
    };
    local struct Shared shared;

    /* Defined defaults: work-items past the end of the input still reach the
       barriers below and must not read uninitialised locals. */
    int key = 0;
    double val = 0.0;
    if (g_id < n)
    {
        key = keys[g_id];
        val = vals[g_id];
        shared.keys[l_id] = key;
        shared.vals[l_id] = val;
    }
    double sum = val;

    /* Scan inside the group; with the bound `offset < 1` the body never executes. */
    for(size_t offset = 1; offset < 1; offset *= 2)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (l_id >= offset && shared.keys[l_id - offset] == key)
        {
            sum = oper(sum, shared.vals[l_id - offset]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        shared.vals[l_id] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (g_id >= n) return;

    /* A run ends at the last element or where the next key differs.  The
       last element is tested by position rather than against a sentinel key,
       so a genuine key value can never be mistaken for "no next element". */
    if (g_id == n - 1 || keys[g_id + 1] != key) output[g_id] = sum;

    if (l_id == 0)
    {
        key_buf[block] = shared.keys[0];
        val_buf[block] = shared.vals[0];
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_inclusive_scan_by_key: segmented running combination over pre_sum.
 * Consecutive entries whose key_sum values are equal are combined with
 * oper(); a change of key restarts the running value.  Work-item g_id owns
 * the work_per_thread entries starting at map_id = g_id * work_per_thread.
 *
 *   n                element count used for the bounds checks
 *   key_sum          key of every entry
 *   pre_sum          input values
 *   post_sum         output; receives the running combinations
 *   work_per_thread  number of consecutive entries handled per work-item
 *
 * NOTE(review): the one-element Shared arrays and the `offset < 1` loop
 * bound look generated for a work-group size of 1 -- confirm against the
 * host code.
 */
kernel void block_inclusive_scan_by_key
(
ulong n,
global const int * key_sum,
global const double * pre_sum,
global double * post_sum,
uint work_per_thread
)
{
size_t l_id = get_local_id(0);
size_t g_id = get_global_id(0);
size_t map_id = g_id * work_per_thread;
struct Shared
{
int keys[1];
double vals[1];
};
local struct Shared shared;
uint offset;
int key;
double work_sum;
/* Pass 1: sequential segmented scan over this work-item's own entries. */
if (map_id < n)
{
int prev_key;
offset = 0;
key = key_sum[map_id];
work_sum = pre_sum[map_id];
post_sum[map_id] = work_sum;
for( offset = offset + 1; offset < work_per_thread; ++offset )
{
prev_key = key;
/* NOTE(review): this key is read before the map_id + offset < n check
   below -- verify key_sum is large enough. */
key = key_sum[ map_id + offset ];
if ( map_id + offset < n )
{
double y = pre_sum[ map_id + offset ];
if ( key == prev_key ) work_sum = oper( work_sum, y );
else work_sum = y;
post_sum[ map_id + offset ] = work_sum;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Publish this work-item's final running value and key to local memory. */
/* NOTE(review): work_sum and key have not been assigned when map_id >= n. */
shared.vals[ l_id ] = work_sum;
shared.keys[ l_id ] = key;
double scan_sum = work_sum;
/* Cross-work-item segmented scan; with the bound `offset < 1` the body never executes. */
for( offset = 1; offset < 1; offset *= 2 )
{
barrier(CLK_LOCAL_MEM_FENCE);
if (map_id < n)
{
if (l_id >= offset)
{
int key1 = shared.keys[ l_id ];
int key2 = shared.keys[ l_id - offset ];
if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );
else scan_sum = shared.vals[ l_id ];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
shared.vals[ l_id ] = scan_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Pass 2: every work-item except the first in its group adds the previous
   work-item's value to each owned entry whose key matches that work-item's key. */
/* NOTE(review): map_id + offset is not checked against n in this loop. */
for( offset = 0; offset < work_per_thread; ++offset )
{
barrier(CLK_GLOBAL_MEM_FENCE);
if (map_id < n && l_id > 0)
{
double y = post_sum[ map_id + offset ];
int key1 = key_sum [ map_id + offset ];
int key2 = shared.keys[ l_id - 1 ];
if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );
post_sum[ map_id + offset ] = y;
}
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary combining operation used by the by-key scan: double-precision addition. */
double oper
(
    double x,
    double y
)
{
    double combined = x;
    combined += y;
    return combined;
}
/*
 * block_sum_by_key: adds the carry of the previous work-group to the
 * elements that close a run of equal keys.
 *
 * An element is updated only when all of the following hold:
 *   - it belongs to a group other than group 0,
 *   - key_sum[group - 1] equals the element's own key,
 *   - it is the last element overall or the next element has another key.
 * In that case output[g_id] becomes oper(output[g_id], post_sum[group - 1]).
 */
kernel void block_sum_by_key
(
    ulong n,
    global const int * key_sum,
    global const double * post_sum,
    global const int * keys,
    global double * output
)
{
    const size_t gid = get_global_id(0);
    const size_t grp = get_group_id(0);

    /* Nothing to do outside the data or in the first group. */
    if (gid >= n || grp == 0) return;

    const int own_key = keys[gid];

    /* The carry only applies when the previous group ended on the same key. */
    if (key_sum[grp - 1] != own_key) return;

    /* Elements in the middle of a run are left alone. */
    if (gid < n - 1 && keys[gid + 1] == own_key) return;

    const double current = output[gid];
    const double carry = post_sum[grp - 1];
    output[gid] = oper(current, carry);
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * key_value_mapping: compacts (key0, key1, value) triples to one per run.
 * offset[i] is the output slot of element i.  The element that closes a
 * run (its successor has a different slot, or it is the last element)
 * copies both key components and its value into that slot.  Every
 * work-item processes one contiguous slice of the n elements.
 */
kernel void key_value_mapping
(
    ulong n,
    global const int * ikeys0,
    global const long * ikeys1,
    global int * okeys0,
    global long * okeys1,
    global double * ovals,
    global int * offset,
    global const double * ivals
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        const int sections = offset[n - 1] + 1;
        const int slot = offset[i];

        if (i == (n - 1))
        {
            /* The last element always closes the final run. */
            okeys0[sections - 1] = ikeys0[i];
            okeys1[sections - 1] = ikeys1[i];
            ovals[sections - 1] = ivals[i];
        }
        else if (slot != offset[i + 1])
        {
            okeys0[slot] = ikeys0[i];
            okeys1[slot] = ikeys1[i];
            ovals[slot] = ivals[i];
        }
        ++i;
    }
}
*** No errors detected
<end of output>
Test time = 2.03 sec
----------------------------------------------------------
Test Passed.
"reduce_by_key" end time: Jan 30 13:43 IST
"reduce_by_key" time elapsed: 00:00:02
----------------------------------------------------------
25/30 Testing: logical
25/30 Test: logical
Command: "/tmp/vexcl/build/tests/logical"
Directory: /tmp/vexcl/build/tests
"logical" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605616
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 2 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_vector_kernel: element-wise float addition,
 * prm_1[i] = prm_2[i] + prm_3[i] for every i < n.  Every work-item
 * processes one contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        const float lhs = prm_2[i];
        const float rhs = prm_3[i];
        prm_1[i] = lhs + rhs;
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_vector_kernel: fills prm_1 with an index sequence,
 * prm_1[i] = prm_2 + i (converted to int) for every i < n.  Every
 * work-item processes one contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global int * prm_1,
    ulong prm_2
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        prm_1[i] = (int)(prm_2 + i);
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for a non-zero element.
 * result[0] becomes 1 if any prm_1[i] with i < n is non-zero, else 0.
 * The scan stops at the first hit.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = (prm_1[i] != 0);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for an element whose product
 * with prm_1 is non-zero.  result[0] becomes 1 if prm_1 * prm_2[i] is
 * non-zero for any i < n, else 0.  The scan stops at the first hit.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    int prm_1,
    global int * prm_2,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = ((prm_1 * prm_2[i]) != 0);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for an element greater than a
 * threshold.  result[0] becomes 1 if prm_1[i] > prm_2 for any i < n,
 * else 0.  The scan stops at the first hit.
 * The element is an int and the threshold a ulong, so the comparison is
 * carried out after the usual arithmetic conversions.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    ulong prm_2,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = (prm_1[i] > prm_2);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for an element below a
 * threshold.  result[0] becomes 1 if prm_1[i] < prm_2 for any i < n,
 * else 0.  The scan stops at the first hit.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    int prm_2,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = (prm_1[i] < prm_2);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for a zero element.
 * result[0] becomes 1 if any prm_1[i] with i < n is zero, else 0.
 * The scan stops at the first hit.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = (prm_1[i] == 0);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for an element that fails the
 * test (prm_1[i] + prm_2) > prm_3.  result[0] becomes 1 if the test is
 * false for any i < n, else 0.  The scan stops at the first hit.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    int prm_2,
    int prm_3,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = ((prm_1[i] + prm_2) <= prm_3);
    }
    result[0] = hit;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_any_of_kernel: sequential search for an element that is not
 * greater than a threshold.  result[0] becomes 1 if prm_1[i] > prm_2 is
 * false for any i < n, else 0.  The scan stops at the first hit.
 * The element is an int and the threshold a ulong, so the comparison is
 * carried out after the usual arithmetic conversions.
 */
kernel void vexcl_any_of_kernel
(
    ulong n,
    global int * prm_1,
    ulong prm_2,
    global char * result
)
{
    char hit = 0;
    for (ulong i = 0; i < n && !hit; ++i)
    {
        hit = (prm_1[i] <= prm_2);
    }
    result[0] = hit;
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"logical" end time: Jan 30 13:43 IST
"logical" time elapsed: 00:00:00
----------------------------------------------------------
26/30 Testing: threads
26/30 Test: threads
Command: "/tmp/vexcl/build/tests/threads"
Directory: /tmp/vexcl/build/tests
"threads" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605616
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 2 test cases...
#i#fi fd edfeifniende(dc(lc_lk_hkrh_rf_pf6p46)4
)#
# p rpargamgam aO POEPNECNLC LE XETXETNESNISOINO Nc lc_lk_hkrh_rf_pf6p46:4 :e neanbalbel
e#
e#leilfi fd edfeifniende(dc(lc_la_madm_df_pf6p46)4
)#
# p rpargamgam aO POEPNECNLC LE XETXETNESNISOINO Nc lc_la_madm_df_pf6p46:4 :e neanbalbel
e#
e#nednidfi
f
k
ekrenrenle lv oviodi dv evxecxlc_lv_evcetcotro_rk_ekrenrenle
l(
(
u luolnogn gn ,n
,
g lgolboabla li nitn t* *p rpmr_m1_,1
,
i nitn tp rpmr_m2_
2)
){
{
u luolnogn gc hcuhnukn_ks_isziez e = =( n( n+ +g egte_tg_lgolboabla_ls_isziez(e0()0 )- -1 )1 )/ /g egte_tg_lgolboabla_ls_isziez(e0()0;)
;
u luolnogn gc hcuhnukn_ks_tsatratr t= =g egte_tg_lgolboabla_li_di(d0()0 )* *c hcuhnukn_ks_isziez;e
;
u luolnogn gc hcuhnukn_ke_nedn d = =c hcuhnukn_ks_tsatratr t+ +c hcuhnukn_ks_isziez;e
;
i fi f( n( n< <c hcuhnukn_ke_nedn)d )c hcuhnukn_ke_nedn d= =n ;n
;
f ofro(ru(luolnogn gi dixd x= =c hcuhnukn_ks_tsatratr;t ;i dixd x< <c hcuhnukn_ke_nedn;d ;+ ++i+dixd)x
)
{
{
p rpmr_m1_[1i[dixd]x ]= =p rpmr_m2_;2
;
}
}}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: e#niafb ldee
f#ienndife
d
(lcoln_gk hSrU_Mf_pl6o4n)g
#(
p long prm1,
long prm2
)
{
return prm1 + prm2;
}
rkaegrmnae lO PvEoNiCdL vEeXxTcElN_SrIeOdNu cctlo_rk_hkre_rfnpe6l4
:(
e n aublloen
g# enl,i
f dgelfoibnaeld (icnlt_ a*m dp_rfmp_614,)
# g lporbaaglm al oOnPgE N*C Lg _EoXdTaEtNaS
I)O
N{
c l _laomndg_ fmpy6S4u:m e=n a(blloen
g#)e0n;d
i f
u
lloonngg cShUuMn_kl_osnigz
e(
= l(onn g+ pgremt1_,g
l o blaoln_gs ipzrem(20
))
-{
1 ) r/e tguertn_ gplromb1a l+_ spirzme2(;0
)};
k e runleoln gv ocihdu nvke_xsctla_rrte d=u cgteotr__gkleorbnaell_
i(d
( 0 )u l*o ncgh unn,k
_ s igzleo;b
a l uilnotn g* cphrumn_k1_,e
n d g l o=b aclh ulnokn_gs t*a rgt_ o+d acthau
n)k
_{s
i z el;o
n g imfy S(unm <= c(hluonnkg_)e0n;d
) cuhluonnkg_ ecnhdu n=k _ns;i
z e f o=r ((unl o+n gg eitd_xg l=o bcahlu_nski_zset(a0r)t ;- i1d)x /< gcehtu_ngkl_oebnadl;_ s+i+zied(x0))
;
{
u l o n gm ycShuumn k=_ sStUaMr_tl o=n gg(emty_Sgulmo,b aplr_mi_d1([0i)d x*] )c;h
u n k}_
s i zge_;o
d a tual[ognegt _cghruonukp__eindd( 0 ) ]= =c hmuynSku_ms;t
a}r
t + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
mySum = SUM_long(mySum, prm_1[idx]);
}
g_odata[get_group_id(0)] = mySum;
}
*** No errors detected
<end of output>
Test time = 0.04 sec
----------------------------------------------------------
Test Passed.
"threads" end time: Jan 30 13:43 IST
"threads" time elapsed: 00:00:00
----------------------------------------------------------
27/30 Testing: multiple_objects
27/30 Test: multiple_objects
Command: "/tmp/vexcl/build/tests/multiple_objects"
Directory: /tmp/vexcl/build/tests
"multiple_objects" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
Running 1 test case...
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
*** No errors detected
<end of output>
Test time = 0.02 sec
----------------------------------------------------------
Test Passed.
"multiple_objects" end time: Jan 30 13:43 IST
"multiple_objects" time elapsed: 00:00:00
----------------------------------------------------------
28/30 Testing: boost_compute_sort
28/30 Test: boost_compute_sort
Command: "/tmp/vexcl/build/tests/boost_compute_sort"
Directory: /tmp/vexcl/build/tests
"boost_compute_sort" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605616
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 2 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_vector_kernel: element-wise float addition,
 * prm_1[i] = prm_2[i] + prm_3[i] for every i < n.  Every work-item
 * processes one contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        const float lhs = prm_2[i];
        const float rhs = prm_3[i];
        prm_1[i] = lhs + rhs;
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * philox_uint_2_10: applies ten rounds of a multiply/xor mixing step to
 * the two-word counter `ctr`, in place, using key[0] as the round key.
 * key[0] is advanced by 0x9E3779B9 between rounds (nine times in total),
 * so the caller's key array is modified as well.
 */
void philox_uint_2_10
(
    uint * ctr,
    uint * key
)
{
    for (int round = 0; round < 10; ++round)
    {
        /* The key is bumped before every round except the first. */
        if (round != 0) key[0] += 0x9E3779B9;

        const uint hi = mul_hi(0xD256D193, ctr[0]);
        const uint lo = 0xD256D193 * ctr[0];
        ctr[0] = hi ^ key[0] ^ ctr[1];
        ctr[1] = lo;
    }
}
/*
 * random_float_philox: counter-based random float.
 * The two arguments (converted to uint) form the counter, the key is the
 * fixed constant 0x12345678, and the first word produced by
 * philox_uint_2_10 is divided by 4294967295.0f to give the result.
 */
float random_float_philox
(
    ulong prm1,
    ulong prm2
)
{
    /* The union lets the counter words and the float result share storage. */
    union
    {
        uint ctr[2];
        uint res_i[1];
        float res_f[1];
        float res;
    } state;
    uint round_key[1] = { 0x12345678 };

    state.ctr[0] = prm1;
    state.ctr[1] = prm2;
    philox_uint_2_10(state.ctr, round_key);

    const uint bits = state.res_i[0];
    state.res_f[0] = bits / 4294967295.0f;
    return state.res;
}
/*
 * vexcl_vector_kernel: fills prm_1 with random floats,
 * prm_1[i] = random_float_philox(prm_2 + i, prm_3) for every i < n.
 * Every work-item processes one contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    ulong prm_2,
    int prm_3
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        prm_1[i] = random_float_philox(prm_2 + i, prm_3);
        ++i;
    }
}
/Users/Rajesh/GDrive/codebase/repos/Qwixie/ext/headers/boost/compute/command_queue.hpp:1247: fatal error in "boost::compute::event boost::compute::command_queue::enqueue_nd_range_kernel(const boost::compute::kernel &, size_t, const size_t *, const size_t *, const size_t *, const boost::compute::wait_list &)": std::exception: Invalid Work Group Size
/tmp/vexcl/tests/context_setup.hpp:100: last checkpoint
*** 1 failure detected in test suite "BoostComputeSort"
<end of output>
Test time = 0.09 sec
----------------------------------------------------------
Test Failed.
"boost_compute_sort" end time: Jan 30 13:43 IST
"boost_compute_sort" time elapsed: 00:00:00
----------------------------------------------------------
29/30 Testing: boost_compute_scan
29/30 Test: boost_compute_scan
Command: "/tmp/vexcl/build/tests/boost_compute_scan"
Directory: /tmp/vexcl/build/tests
"boost_compute_scan" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605616
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 3 test cases...
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_vector_kernel: element-wise float addition,
 * prm_1[i] = prm_2[i] + prm_3[i] for every i < n.  Every work-item
 * processes one contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        const float lhs = prm_2[i];
        const float rhs = prm_3[i];
        prm_1[i] = lhs + rhs;
        ++i;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * vexcl_vector_kernel: adds the scalar prm_2 to every element,
 * prm_1[i] += prm_2 for every i < n.  Every work-item processes one
 * contiguous slice of the n elements.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global double * prm_1,
    double prm_2
)
{
    const ulong items = get_global_size(0);
    const ulong per_item = (n + items - 1) / items;
    const ulong first = get_global_id(0) * per_item;
    const ulong stop = (first + per_item > n) ? n : first + per_item;

    ulong i = first;
    while (i < stop)
    {
        prm_1[i] = prm_1[i] + prm_2;
        ++i;
    }
}
*** No errors detected
<end of output>
Test time = 0.09 sec
----------------------------------------------------------
Test Passed.
"boost_compute_scan" end time: Jan 30 13:43 IST
"boost_compute_scan" time elapsed: 00:00:00
----------------------------------------------------------
30/30 Testing: fft
30/30 Test: fft
Command: "/tmp/vexcl/build/tests/fft"
Directory: /tmp/vexcl/build/tests
"fft" start time: Jan 30 13:43 IST
Output:
----------------------------------------------------------
seed: 1422605616
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple)
Running 4 test cases...
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
Tn = T1 - T2;
T3 = T1 + T2;
TC = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
Tw = v7.y;
Tb = v7.x;
TB = T4 - T5;
T6 = T4 + T5;
To = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv - Ty;
TH = Tv + Ty;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TL, TG, Tu, Tf, Tm, TO;
{
real_t T7, Te, TP, TQ;
TL = T3 - T6;
T7 = T3 + T6;
TG = Tt - Tq;
Tu = Tq + Tt;
Te = Ta + Td;
Tf = Td - Ta;
Tm = Ti - Tl;
TP = Ti + Tl;
TQ = TM + TN;
TO = TM - TN;
v0.x = T7 + Te;
v0.y = TP + TQ;
v4.y = TP - TQ;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = Tn - To;
Tp = Tn + To;
v2.y = Tf + Tm;
v2.x = TL + TO;
v6.x = TL - TO;
v6.y = Tm - Tf;
TA = Tu + Tz;
TE = Tz - Tu;
TD = TB + TC;
TJ = TC - TB;
TK = TG + TH;
TI = TG - TH;
v1.x = fma (KP707106781, TA, Tp);
v1.y = fma (KP707106781, TK, TJ);
v5.y = fma (-KP707106781, TK, TJ);
v5.x = fma (-KP707106781, TA, Tp);
}
}
}
v3.y = fma (KP707106781, TE, TD);
v3.x = fma (KP707106781, TI, TF);
v7.x = fma (-KP707106781, TI, TF);
v7.y = fma (-KP707106781, TE, TD);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
kernel void radix
(
global const float2 * x,
global float2 * y,
uint p,
uint threads
)
{
const size_t i = get_global_id(0);
if(i >= threads) return;
const size_t k = i % p;
const size_t batch_offset = get_global_id(1) * threads * 8;
x += i + batch_offset;
float2 v0 = x[0 * threads];
float2 v1 = x[1 * threads];
float2 v2 = x[2 * threads];
float2 v3 = x[3 * threads];
float2 v4 = x[4 * threads];
float2 v5 = x[5 * threads];
float2 v6 = x[6 * threads];
float2 v7 = x[7 * threads];
if(p != 1)
{
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p));
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p));
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p));
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p));
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p));
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p));
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p));
}
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);
const size_t j = k + (i - k) * 8;
y += j + batch_offset;
y[0 * p] = v0;
y[1 * p] = v1;
y[2 * p] = v2;
y[3 * p] = v3;
y[4 * p] = v4;
y[5 * p] = v5;
y[6 * p] = v6;
y[7 * p] = v7;
}
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
Tn = T1 - T2;
T3 = T1 + T2;
TC = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
Tw = v7.y;
Tb = v7.x;
TB = T4 - T5;
T6 = T4 + T5;
To = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv - Ty;
TH = Tv + Ty;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TL, TG, Tu, Tf, Tm, TO;
{
real_t T7, Te, TP, TQ;
TL = T3 - T6;
T7 = T3 + T6;
TG = Tt - Tq;
Tu = Tq + Tt;
Te = Ta + Td;
Tf = Td - Ta;
Tm = Ti - Tl;
TP = Ti + Tl;
TQ = TM + TN;
TO = TM - TN;
v0.x = T7 + Te;
v0.y = TP + TQ;
v4.y = TP - TQ;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = Tn - To;
Tp = Tn + To;
v2.y = Tf + Tm;
v2.x = TL + TO;
v6.x = TL - TO;
v6.y = Tm - Tf;
TA = Tu + Tz;
TE = Tz - Tu;
TD = TB + TC;
TJ = TC - TB;
TK = TG + TH;
TI = TG - TH;
v1.x = fma (KP707106781, TA, Tp);
v1.y = fma (KP707106781, TK, TJ);
v5.y = fma (-KP707106781, TK, TJ);
v5.x = fma (-KP707106781, TA, Tp);
}
}
}
v3.y = fma (KP707106781, TE, TD);
v3.x = fma (KP707106781, TI, TF);
v7.x = fma (-KP707106781, TI, TF);
v7.y = fma (-KP707106781, TE, TD);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
kernel void radix
(
global const float2 * x,
global float2 * y,
uint p,
uint threads
)
{
const size_t i = get_global_id(0);
if(i >= threads) return;
const size_t k = i % p;
const size_t batch_offset = get_global_id(1) * threads * 8;
x += i + batch_offset;
float2 v0 = x[0 * threads];
float2 v1 = x[1 * threads];
float2 v2 = x[2 * threads];
float2 v3 = x[3 * threads];
float2 v4 = x[4 * threads];
float2 v5 = x[5 * threads];
float2 v6 = x[6 * threads];
float2 v7 = x[7 * threads];
if(p != 1)
{
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p));
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p));
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p));
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p));
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p));
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p));
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p));
}
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);
const size_t j = k + (i - k) * 8;
y += j + batch_offset;
y[0 * p] = v0;
y[1 * p] = v1;
y[2 * p] = v2;
y[3 * p] = v3;
y[4 * p] = v4;
y[5 * p] = v5;
y[6 * p] = v6;
y[7 * p] = v7;
}
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 144 FP additions, 40 FP multiplications,
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
* 97 stack variables, 3 constants, and 64 memory accesses
*/
DEVICE void
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13,
real2_t * u14, real2_t * u15)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
real2_t v13 = *u13;
real2_t v14 = *u14;
real2_t v15 = *u15;
{
const real_t KP923879532 =
+0.923879532511286756128183189396788286822416626;
const real_t KP414213562 =
+0.414213562373095048801688724209698078569671875;
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q;
{
real_t T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt,
T2h, T22, T1D;
real_t T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, T12, Tj, T11,
Ti, T1V, TZ;
real_t Tk, T13;
{
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e;
{
real_t Tz, T4, TL, T3, T1k, Ty, T5, TA;
{
real_t Tw, T1, T2, Tx;
Tw = v0.y;
T1 = v0.x;
T2 = v8.x;
Tx = v8.y;
Tz = v4.y;
T4 = v4.x;
TL = T1 - T2;
T3 = T1 + T2;
T1k = Tw - Tx;
Ty = Tw + Tx;
T5 = v12.x;
TA = v12.y;
}
{
real_t T18, Tn, To, T19;
T18 = v15.y;
Tn = v15.x;
{
real_t T1j, T6, TM, TB;
T1j = T4 - T5;
T6 = T4 + T5;
TM = Tz - TA;
TB = Tz + TA;
T1l = T1j + T1k;
T1H = T1k - T1j;
T1R = T3 - T6;
T7 = T3 + T6;
T1x = TL + TM;
TN = TL - TM;
TC = Ty + TB;
T25 = Ty - TB;
To = v7.x;
T19 = v7.y;
}
T1d = v3.y;
Tq = v3.x;
T1c = Tn - To;
Tp = Tn + To;
T20 = T18 + T19;
T1a = T18 - T19;
Tr = v11.x;
T1e = v11.y;
}
}
{
real_t TG, Tb, TP, Ta, TO, TF, Tc, TH;
{
real_t TD, T8, T9, TE;
TD = v2.y;
T8 = v2.x;
{
real_t T17, Ts, T21, T1f;
T17 = Tq - Tr;
Ts = Tq + Tr;
T21 = T1d + T1e;
T1f = T1d - T1e;
T1E = T1a - T17;
T1b = T17 + T1a;
T1Z = Tp - Ts;
Tt = Tp + Ts;
T2h = T20 + T21;
T22 = T20 - T21;
T1D = T1c + T1f;
T1g = T1c - T1f;
T9 = v10.x;
TE = v10.y;
}
TG = v14.y;
Tb = v14.x;
TP = T8 - T9;
Ta = T8 + T9;
TO = TD - TE;
TF = TD + TE;
Tc = v6.x;
TH = v6.y;
}
{
real_t TR, Td, TS, TI;
T1n = TP + TO;
TQ = TO - TP;
TR = Tb - Tc;
Td = Tb + Tc;
TS = TG - TH;
TI = TG + TH;
Te = Ta + Td;
T26 = Td - Ta;
TT = TR + TS;
T1m = TR - TS;
TJ = TF + TI;
T1S = TF - TI;
}
}
{
real_t TX, Tg, Th, TY;
TX = v1.y;
Tg = v1.x;
Th = v9.x;
TY = v9.y;
T12 = v5.y;
Tj = v5.x;
T11 = Tg - Th;
Ti = Tg + Th;
T1V = TX + TY;
TZ = TX - TY;
Tk = v13.x;
T13 = v13.y;
}
}
{
real_t T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i;
{
real_t Tf, Tu, T2j, T2k, T2g;
T2f = T7 - Te;
Tf = T7 + Te;
{
real_t TW, Tl, T1W, T14, Tm;
TW = Tj - Tk;
Tl = Tj + Tk;
T1W = T12 + T13;
T14 = T12 - T13;
T1B = TZ - TW;
T10 = TW + TZ;
T1U = Ti - Tl;
Tm = Ti + Tl;
T2g = T1V + T1W;
T1X = T1V - T1W;
T1A = T11 + T14;
T15 = T11 - T14;
Tu = Tm + Tt;
Tv = Tt - Tm;
}
TK = TC - TJ;
T2j = TC + TJ;
T2k = T2g + T2h;
T2i = T2g - T2h;
v0.x = Tf + Tu;
v0.y = T2j + T2k;
v8.y = T2j - T2k;
v8.x = Tf - Tu;
}
{
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y,
T23;
T29 = T1R - T1S;
T1T = T1R + T1S;
v4.y = Tv + TK;
v4.x = T2f + T2i;
v12.x = T2f - T2i;
v12.y = TK - Tv;
T27 = T25 - T26;
T2d = T26 + T25;
T2a = T1X - T1U;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T2b = T1Z + T22;
T28 = T23 - T1Y;
T24 = T1Y + T23;
{
real_t T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q,
T1i;
{
real_t T1o, T2e, T2c, TU, T16, T1h;
T1I = TQ + TT;
TU = TQ - TT;
T2e = T2a + T2b;
T2c = T2a - T2b;
TV = fma (KP707106781, TU, TN);
T1v = fma (-KP707106781, TU, TN);
v10.x = fma (-KP707106781, T24, T1T);
v10.y = fma (-KP707106781, T2e, T2d);
v2.y = fma (KP707106781, T2e, T2d);
v2.x = fma (KP707106781, T24, T1T);
v6.y = fma (KP707106781, T28, T27);
v6.x = fma (KP707106781, T2c, T29);
v14.x = fma (-KP707106781, T2c, T29);
v14.y = fma (-KP707106781, T28, T27);
T1o = T1m - T1n;
T1y = T1n + T1m;
T1t = fma (-KP414213562, T10, T15);
T16 = fma (KP414213562, T15, T10);
T1h = fma (-KP414213562, T1g, T1b);
T1s = fma (KP414213562, T1b, T1g);
T1r = fma (KP707106781, T1o, T1l);
T1p = fma (-KP707106781, T1o, T1l);
T1q = T16 + T1h;
T1i = T16 - T1h;
}
{
real_t T1w, T1u, T1C, T1F;
T1w = T1t + T1s;
T1u = T1s - T1t;
T1z = fma (KP707106781, T1y, T1x);
T1L = fma (-KP707106781, T1y, T1x);
v15.y = fma (KP923879532, T1q, T1p);
v15.x = fma (KP923879532, T1w, T1v);
v7.x = fma (-KP923879532, T1w, T1v);
v7.y = fma (-KP923879532, T1q, T1p);
v3.x = fma (KP923879532, T1i, TV);
v3.y = fma (KP923879532, T1u, T1r);
v11.y = fma (-KP923879532, T1u, T1r);
v11.x = fma (-KP923879532, T1i, TV);
T1M = fma (-KP414213562, T1A, T1B);
T1C = fma (KP414213562, T1B, T1A);
T1F = fma (-KP414213562, T1E, T1D);
T1N = fma (KP414213562, T1D, T1E);
T1P = fma (KP707106781, T1I, T1H);
T1J = fma (-KP707106781, T1I, T1H);
T1K = T1F - T1C;
T1G = T1C + T1F;
}
}
}
}
}
T1O = T1M - T1N;
T1Q = T1M + T1N;
v1.x = fma (KP923879532, T1G, T1z);
v1.y = fma (KP923879532, T1Q, T1P);
v9.y = fma (-KP923879532, T1Q, T1P);
v9.x = fma (-KP923879532, T1G, T1z);
v5.y = fma (KP923879532, T1K, T1J);
v5.x = fma (KP923879532, T1O, T1L);
v13.x = fma (-KP923879532, T1O, T1L);
v13.y = fma (-KP923879532, T1K, T1J);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
*u13 = v13;
*u14 = v14;
*u15 = v15;
}
kernel void radix
(
global const float2 * x,
global float2 * y,
uint p,
uint threads
)
{
const size_t i = get_global_id(0);
if(i >= threads) return;
const size_t k = i % p;
const size_t batch_offset = get_global_id(1) * threads * 16;
x += i + batch_offset;
float2 v0 = x[0 * threads];
float2 v1 = x[1 * threads];
float2 v2 = x[2 * threads];
float2 v3 = x[3 * threads];
float2 v4 = x[4 * threads];
float2 v5 = x[5 * threads];
float2 v6 = x[6 * threads];
float2 v7 = x[7 * threads];
float2 v8 = x[8 * threads];
float2 v9 = x[9 * threads];
float2 v10 = x[10 * threads];
float2 v11 = x[11 * threads];
float2 v12 = x[12 * threads];
float2 v13 = x[13 * threads];
float2 v14 = x[14 * threads];
float2 v15 = x[15 * threads];
if(p != 1)
{
v1 = mul(v1, twiddle((float)-0.3926990926265717 * k / p));
v2 = mul(v2, twiddle((float)-0.7853981852531433 * k / p));
v3 = mul(v3, twiddle((float)-1.178097248077393 * k / p));
v4 = mul(v4, twiddle((float)-1.570796370506287 * k / p));
v5 = mul(v5, twiddle((float)-1.963495492935181 * k / p));
v6 = mul(v6, twiddle((float)-2.356194496154785 * k / p));
v7 = mul(v7, twiddle((float)-2.748893737792969 * k / p));
v8 = mul(v8, twiddle((float)-3.141592741012573 * k / p));
v9 = mul(v9, twiddle((float)-3.534291744232178 * k / p));
v10 = mul(v10, twiddle((float)-3.926990985870361 * k / p));
v11 = mul(v11, twiddle((float)-4.319690227508545 * k / p));
v12 = mul(v12, twiddle((float)-4.71238899230957 * k / p));
v13 = mul(v13, twiddle((float)-5.105088233947754 * k / p));
v14 = mul(v14, twiddle((float)-5.497787475585938 * k / p));
v15 = mul(v15, twiddle((float)-5.890486240386963 * k / p));
}
dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15);
const size_t j = k + (i - k) * 16;
y += j + batch_offset;
y[0 * p] = v0;
y[1 * p] = v1;
y[2 * p] = v2;
y[3 * p] = v3;
y[4 * p] = v4;
y[5 * p] = v5;
y[6 * p] = v6;
y[7 * p] = v7;
y[8 * p] = v8;
y[9 * p] = v9;
y[10 * p] = v10;
y[11 * p] = v11;
y[12 * p] = v12;
y[13 * p] = v13;
y[14 * p] = v14;
y[15 * p] = v15;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
float2 r2c
(
float v
)
{
float2 r = {v, 0}; return r;
}
kernel void vexcl_vector_kernel
(
ulong n,
global float2 * prm_1,
global float * prm_2,
global float * prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = r2c( ( prm_2[idx] * prm_3[idx] ) );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
float c2r
(
float2 v
)
{
return v.x;
}
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
float prm_2,
global float2 * prm_3,
int prm_4
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] += ( ( prm_2 * c2r( prm_3[idx] ) ) * prm_4 );
}
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
void philox_uint_2_10
(
uint * ctr,
uint * key
)
{
uint m[2];
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
key[0] += 0x9E3779B9;
m[0] = mul_hi(0xD256D193, ctr[0]);
m[1] = 0xD256D193 * ctr[0];
ctr[0] = m[0] ^ key[0] ^ ctr[1];
ctr[1] = m[1];
}
float random_float_philox
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[2];
uint res_i[1];
float res_f[1];
float res;
} ctr;
uint key[1];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
key[0] = 0x12345678;
philox_uint_2_10(ctr.ctr, key);
ctr.res_f[0] = ctr.res_i[0] / 4294967295.0f;
return ctr.res;
}
kernel void vexcl_vector_kernel
(
ulong n,
global float * prm_1,
ulong prm_2,
int prm_3
)
{
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);
ulong chunk_start = get_global_id(0) * chunk_size;
ulong chunk_end = chunk_start + chunk_size;
if (n < chunk_end) chunk_end = n;
for(ulong idx = chunk_start; idx < chunk_end; ++idx)
{
prm_1[idx] = random_float_philox( (prm_2 + idx), prm_3 );
}
}
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
Tn = T1 - T2;
T3 = T1 + T2;
TC = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
Tw = v7.y;
Tb = v7.x;
TB = T4 - T5;
T6 = T4 + T5;
To = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv - Ty;
TH = Tv + Ty;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TL, TG, Tu, Tf, Tm, TO;
{
real_t T7, Te, TP, TQ;
TL = T3 - T6;
T7 = T3 + T6;
TG = Tt - Tq;
Tu = Tq + Tt;
Te = Ta + Td;
Tf = Td - Ta;
Tm = Ti - Tl;
TP = Ti + Tl;
TQ = TM + TN;
TO = TM - TN;
v0.x = T7 + Te;
v0.y = TP + TQ;
v4.y = TP - TQ;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = Tn - To;
Tp = Tn + To;
v2.y = Tf + Tm;
v2.x = TL + TO;
v6.x = TL - TO;
v6.y = Tm - Tf;
TA = Tu + Tz;
TE = Tz - Tu;
TD = TB + TC;
TJ = TC - TB;
TK = TG + TH;
TI = TG - TH;
v1.x = fma (KP707106781, TA, Tp);
v1.y = fma (KP707106781, TK, TJ);
v5.y = fma (-KP707106781, TK, TJ);
v5.x = fma (-KP707106781, TA, Tp);
}
}
}
v3.y = fma (KP707106781, TE, TD);
v3.x = fma (KP707106781, TI, TF);
v7.x = fma (-KP707106781, TI, TF);
v7.y = fma (-KP707106781, TE, TD);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
kernel void radix
(
global const float2 * x,
global float2 * y,
uint p,
uint threads
)
{
const size_t i = get_global_id(0);
if(i >= threads) return;
const size_t k = i % p;
const size_t batch_offset = get_global_id(1) * threads * 8;
x += i + batch_offset;
float2 v0 = x[0 * threads];
float2 v1 = x[1 * threads];
float2 v2 = x[2 * threads];
float2 v3 = x[3 * threads];
float2 v4 = x[4 * threads];
float2 v5 = x[5 * threads];
float2 v6 = x[6 * threads];
float2 v7 = x[7 * threads];
if(p != 1)
{
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p));
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p));
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p));
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p));
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p));
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p));
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p));
}
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);
const size_t j = k + (i - k) * 8;
y += j + batch_offset;
y[0 * p] = v0;
y[1 * p] = v1;
y[2 * p] = v2;
y[3 * p] = v3;
y[4 * p] = v4;
y[5 * p] = v5;
y[6 * p] = v6;
y[7 * p] = v7;
}
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
Tn = T1 - T2;
T3 = T1 + T2;
TC = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
Tw = v7.y;
Tb = v7.x;
TB = T4 - T5;
T6 = T4 + T5;
To = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv - Ty;
TH = Tv + Ty;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TL, TG, Tu, Tf, Tm, TO;
{
real_t T7, Te, TP, TQ;
TL = T3 - T6;
T7 = T3 + T6;
TG = Tt - Tq;
Tu = Tq + Tt;
Te = Ta + Td;
Tf = Td - Ta;
Tm = Ti - Tl;
TP = Ti + Tl;
TQ = TM + TN;
TO = TM - TN;
v0.x = T7 + Te;
v0.y = TP + TQ;
v4.y = TP - TQ;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = Tn - To;
Tp = Tn + To;
v2.y = Tf + Tm;
v2.x = TL + TO;
v6.x = TL - TO;
v6.y = Tm - Tf;
TA = Tu + Tz;
TE = Tz - Tu;
TD = TB + TC;
TJ = TC - TB;
TK = TG + TH;
TI = TG - TH;
v1.x = fma (KP707106781, TA, Tp);
v1.y = fma (KP707106781, TK, TJ);
v5.y = fma (-KP707106781, TK, TJ);
v5.x = fma (-KP707106781, TA, Tp);
}
}
}
v3.y = fma (KP707106781, TE, TD);
v3.x = fma (KP707106781, TI, TF);
v7.x = fma (-KP707106781, TI, TF);
v7.y = fma (-KP707106781, TE, TD);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
kernel void radix
(
global const float2 * x,
global float2 * y,
uint p,
uint threads
)
{
const size_t i = get_global_id(0);
if(i >= threads) return;
const size_t k = i % p;
const size_t batch_offset = get_global_id(1) * threads * 8;
x += i + batch_offset;
float2 v0 = x[0 * threads];
float2 v1 = x[1 * threads];
float2 v2 = x[2 * threads];
float2 v3 = x[3 * threads];
float2 v4 = x[4 * threads];
float2 v5 = x[5 * threads];
float2 v6 = x[6 * threads];
float2 v7 = x[7 * threads];
if(p != 1)
{
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p));
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p));
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p));
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p));
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p));
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p));
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p));
}
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);
const size_t j = k + (i - k) * 8;
y += j + batch_offset;
y[0 * p] = v0;
y[1 * p] = v1;
y[2 * p] = v2;
y[3 * p] = v3;
y[4 * p] = v4;
y[5 * p] = v5;
y[6 * p] = v6;
y[7 * p] = v7;
}
#define DEVICE
typedef float real_t;
typedef float2 real2_t;
float2 mul
(
float2 a,
float2 b
)
{
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
return r;
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 144 FP additions, 40 FP multiplications,
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
* 97 stack variables, 3 constants, and 64 memory accesses
*/
DEVICE void
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13,
real2_t * u14, real2_t * u15)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
real2_t v13 = *u13;
real2_t v14 = *u14;
real2_t v15 = *u15;
{
const real_t KP923879532 =
+0.923879532511286756128183189396788286822416626;
const real_t KP414213562 =
+0.414213562373095048801688724209698078569671875;
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q;
{
real_t T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt,
T2h, T22, T1D;
real_t T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, T12, Tj, T11,
Ti, T1V, TZ;
real_t Tk, T13;
{
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e;
{
real_t Tz, T4, TL, T3, T1k, Ty, T5, TA;
{
real_t Tw, T1, T2, Tx;
Tw = v0.y;
T1 = v0.x;
T2 = v8.x;
Tx = v8.y;
Tz = v4.y;
T4 = v4.x;
TL = T1 - T2;
T3 = T1 + T2;
T1k = Tw - Tx;
Ty = Tw + Tx;
T5 = v12.x;
TA = v12.y;
}
{
real_t T18, Tn, To, T19;
T18 = v15.y;
Tn = v15.x;
{
real_t T1j, T6, TM, TB;
T1j = T4 - T5;
T6 = T4 + T5;
TM = Tz - TA;
TB = Tz + TA;
T1l = T1j + T1k;
T1H = T1k - T1j;
T1R = T3 - T6;
T7 = T3 + T6;
T1x = TL + TM;
TN = TL - TM;
TC = Ty + TB;
T25 = Ty - TB;
To = v7.x;
T19 = v7.y;
}
T1d = v3.y;
Tq = v3.x;
T1c = Tn - To;
Tp = Tn + To;
T20 = T18 + T19;
T1a = T18 - T19;
Tr = v11.x;
T1e = v11.y;
}
}
{
real_t TG, Tb, TP, Ta, TO, TF, Tc, TH;
{
real_t TD, T8, T9, TE;
TD = v2.y;
T8 = v2.x;
{
real_t T17, Ts, T21, T1f;
T17 = Tq - Tr;
Ts = Tq + Tr;
T21 = T1d + T1e;
T1f = T1d - T1e;
T1E = T1a - T17;
T1b = T17 + T1a;
T1Z = Tp - Ts;
Tt = Tp + Ts;
T2h = T20 + T21;
T22 = T20 - T21;
T1D = T1c + T1f;
T1g = T1c - T1f;
T9 = v10.x;
TE = v10.y;
}
TG = v14.y;
Tb = v14.x;
TP = T8 - T9;
Ta = T8 + T9;
TO = TD - TE;
TF = TD + TE;
Tc = v6.x;
TH = v6.y;
}
{
real_t TR, Td, TS, TI;
T1n = TP + TO;
TQ = TO - TP;
TR = Tb - Tc;
Td = Tb + Tc;
TS = TG - TH;
TI = TG + TH;
Te = Ta + Td;
T26 = Td - Ta;
TT = TR + TS;
T1m = TR - TS;
TJ = TF + TI;
T1S = TF - TI;
}
}
{
real_t TX, Tg, Th, TY;
TX = v1.y;
Tg = v1.x;
Th = v9.x;
TY = v9.y;
T12 = v5.y;
Tj = v5.x;
T11 = Tg - Th;
Ti = Tg + Th;
T1V = TX + TY;
TZ = TX - TY;
Tk = v13.x;
T13 = v13.y;
}
}
{
real_t T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i;
{
real_t Tf, Tu, T2j, T2k, T2g;
T2f = T7 - Te;
Tf = T7 + Te;
{
real_t TW, Tl, T1W, T14, Tm;
TW = Tj - Tk;
Tl = Tj + Tk;
T1W = T12 + T13;
T14 = T12 - T13;
T1B = TZ - TW;
T10 = TW + TZ;
T1U = Ti - Tl;
Tm = Ti + Tl;
T2g = T1V + T1W;
T1X = T1V - T1W;
T1A = T11 + T14;
T15 = T11 - T14;
Tu = Tm + Tt;
Tv = Tt - Tm;
}
TK = TC - TJ;
T2j = TC + TJ;
T2k = T2g + T2h;
T2i = T2g - T2h;
v0.x = Tf + Tu;
v0.y = T2j + T2k;
v8.y = T2j - T2k;
v8.x = Tf - Tu;
}
{
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y,
T23;
T29 = T1R - T1S;
T1T = T1R + T1S;
v4.y = Tv + TK;
v4.x = T2f + T2i;
v12.x = T2f - T2i;
v12.y = TK - Tv;
T27 = T25 - T26;
T2d = T26 + T25;
T2a = T1X - T1U;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T2b = T1Z + T22;
T28 = T23 - T1Y;
T24 = T1Y + T23;
{
real_t T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q,
T1i;
{
real_t T1o, T2e, T2c, TU, T16, T1h;
T1I = TQ + TT;
TU = TQ - TT;
T2e = T2a + T2b;
T2c = T2a - T2b;
TV = fma (KP707106781, TU, TN);
T1v = fma (-KP707106781, TU, TN);
v10.x = fma (-KP707106781, T24, T1T);
v10.y = fma (-KP707106781, T2e, T2d);
v2.y = fma (KP707106781, T2e, T2d);
v2.x = fma (KP707106781, T24, T1T);
v6.y = fma (KP707106781, T28, T27);
v6.x = fma (KP707106781, T2c, T29);
v14.x = fma (-KP707106781, T2c, T29);
v14.y = fma (-KP707106781, T28, T27);
T1o = T1m - T1n;
T1y = T1n + T1m;
T1t = fma (-KP414213562, T10, T15);
T16 = fma (KP414213562, T15, T10);
T1h = fma (-KP414213562, T1g, T1b);
T1s = fma (KP414213562, T1b, T1g);
T1r = fma (KP707106781, T1o, T1l);
T1p = fma (-KP707106781, T1o, T1l);
T1q = T16 + T1h;
T1i = T16 - T1h;
}
{
real_t T1w, T1u, T1C, T1F;
T1w = T1t + T1s;
T1u = T1s - T1t;
T1z = fma (KP707106781, T1y, T1x);
T1L = fma (-KP707106781, T1y, T1x);
v15.y = fma (KP923879532, T1q, T1p);
v15.x = fma (KP923879532, T1w, T1v);
v7.x = fma (-KP923879532, T1w, T1v);
v7.y = fma (-KP923879532, T1q, T1p);
v3.x = fma (KP923879532, T1i, TV);
v3.y = fma (KP923879532, T1u, T1r);
v11.y = fma (-KP923879532, T1u, T1r);
v11.x = fma (-KP923879532, T1i, TV);
T1M = fma (-KP414213562, T1A, T1B);
T1C = fma (KP414213562, T1B, T1A);
T1F = fma (-KP414213562, T1E, T1D);
T1N = fma (KP414213562, T1D, T1E);
T1P = fma (KP707106781, T1I, T1H);
T1J = fma (-KP707106781, T1I, T1H);
T1K = T1F - T1C;
T1G = T1C + T1F;
}
}
}
}
}
T1O = T1M - T1N;
T1Q = T1M + T1N;
v1.x = fma (KP923879532, T1G, T1z);
v1.y = fma (KP923879532, T1Q, T1P);
v9.y = fma (-KP923879532, T1Q, T1P);
v9.x = fma (-KP923879532, T1G, T1z);
v5.y = fma (KP923879532, T1K, T1J);
v5.x = fma (KP923879532, T1O, T1L);
v13.x = fma (-KP923879532, T1O, T1L);
v13.y = fma (-KP923879532, T1K, T1J);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
*u13 = v13;
*u14 = v14;
*u15 = v15;
}
/*
 * Radix-16 pass over single-precision two-component data.
 *
 * x       - input buffer; each work-item reads 16 elements spaced `threads`
 *           apart
 * y       - output buffer; each work-item writes 16 elements spaced `p` apart
 * p       - output stride, also the modulus that yields the twiddle index
 * threads - number of work-items along dimension 0 that do work
 *
 * Dimension 1 of the NDRange selects the batch; each batch covers
 * `threads * 16` consecutive elements of both buffers.
 */
kernel void radix
(
    global const float2 * x,
    global float2 * y,
    uint p,
    uint threads
)
{
    const size_t item = get_global_id(0);
    if (item >= threads)
        return; /* work-items at or past `threads` do nothing */

    const size_t tw = item % p;
    const size_t batch_base = get_global_id(1) * threads * 16;

    /* Gather the 16 inputs handled by this work-item. */
    global const float2 * src = x + (item + batch_base);
    float2 v0 = src[0 * threads];
    float2 v1 = src[1 * threads];
    float2 v2 = src[2 * threads];
    float2 v3 = src[3 * threads];
    float2 v4 = src[4 * threads];
    float2 v5 = src[5 * threads];
    float2 v6 = src[6 * threads];
    float2 v7 = src[7 * threads];
    float2 v8 = src[8 * threads];
    float2 v9 = src[9 * threads];
    float2 v10 = src[10 * threads];
    float2 v11 = src[11 * threads];
    float2 v12 = src[12 * threads];
    float2 v13 = src[13 * threads];
    float2 v14 = src[14 * threads];
    float2 v15 = src[15 * threads];

    /*
     * Twiddle inputs 1..15.  The literals are -2*pi*m/16 rounded to single
     * precision.  With p == 1 the index tw is always 0, so the step is skipped.
     */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((float)-0.3926990926265717 * tw / p));
        v2 = mul(v2, twiddle((float)-0.7853981852531433 * tw / p));
        v3 = mul(v3, twiddle((float)-1.178097248077393 * tw / p));
        v4 = mul(v4, twiddle((float)-1.570796370506287 * tw / p));
        v5 = mul(v5, twiddle((float)-1.963495492935181 * tw / p));
        v6 = mul(v6, twiddle((float)-2.356194496154785 * tw / p));
        v7 = mul(v7, twiddle((float)-2.748893737792969 * tw / p));
        v8 = mul(v8, twiddle((float)-3.141592741012573 * tw / p));
        v9 = mul(v9, twiddle((float)-3.534291744232178 * tw / p));
        v10 = mul(v10, twiddle((float)-3.926990985870361 * tw / p));
        v11 = mul(v11, twiddle((float)-4.319690227508545 * tw / p));
        v12 = mul(v12, twiddle((float)-4.71238899230957 * tw / p));
        v13 = mul(v13, twiddle((float)-5.105088233947754 * tw / p));
        v14 = mul(v14, twiddle((float)-5.497787475585938 * tw / p));
        v15 = mul(v15, twiddle((float)-5.890486240386963 * tw / p));
    }

    /* In-place 16-point transform of the gathered values. */
    dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15);

    /* Scatter the results with stride p. */
    const size_t out = tw + (item - tw) * 16;
    global float2 * dst = y + (out + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
    dst[11 * p] = v11;
    dst[12 * p] = v12;
    dst[13 * p] = v13;
    dst[14 * p] = v14;
    dst[15 * p] = v15;
}
/* DEVICE expands to nothing, so a `DEVICE void` function is an ordinary function. */
#define DEVICE
/* Scalar and two-component working types used by the generated codelet (single precision). */
typedef float real_t;
typedef float2 real2_t;
/*
 * Multiply a by the complex conjugate of b, treating .x as the real part and
 * .y as the imaginary part:
 *   (a.x + i*a.y) * (b.x - i*b.y)
 */
float2 mul
(
    float2 a,
    float2 b
)
{
    return (float2)(a.x * b.x + a.y * b.y,
                    a.y * b.x - a.x * b.y);
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
/*
 * dft8: in-place 8-point transform on eight two-component values.
 *
 * Every argument is read once on entry (into v0..v7) and overwritten once on
 * exit, so the eight pointed-to locations are both inputs and outputs.
 * Output 0 is the component-wise sum of all inputs; output 4 is the sum of
 * the even-numbered inputs minus the sum of the odd-numbered ones.
 * NOTE(review): .x/.y presumably hold the real/imaginary parts and the
 * transform direction is the one selected by "-sign 1" in the generator
 * command quoted just before this function - confirm against the generator.
 *
 * The body is generated straight-line code; its statement order was chosen by
 * the generator's scheduler and is best left untouched.
 */
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
/* Work on local copies; results are stored back at the end. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
/* sqrt(2)/2: the only multiplicative constant in this codelet. */
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
/* Load inputs 0, 4, 2, 6 and form sum/difference of inputs 0 and 4. */
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
TB = T1 - T2;
T3 = T1 + T2;
Tn = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
/* Sum/difference of inputs 2 and 6, interleaved with loads of 7 and 3. */
Tw = v7.y;
Tb = v7.x;
To = T4 - T5;
T6 = T4 + T5;
TC = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
/* Sum/difference of inputs 7 and 3, then of inputs 1 and 5. */
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv + Ty;
TH = Ty - Tv;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TP, TG, Tu, Tf, Tm, TQ;
{
real_t T7, Te, TL, TO;
TP = T3 - T6;
T7 = T3 + T6;
TG = Tq + Tt;
Tu = Tq - Tt;
Te = Ta + Td;
Tf = Ta - Td;
Tm = Ti - Tl;
TL = Ti + Tl;
TO = TM + TN;
TQ = TN - TM;
/* Outputs 0 and 4: (even-input sum) +/- (odd-input sum). */
v0.x = T7 + Te;
v0.y = TL + TO;
v4.y = TL - TO;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = To + Tn;
Tp = Tn - To;
/* Outputs 2 and 6 need only additions and subtractions. */
v2.y = Tf + Tm;
v2.x = TP + TQ;
v6.x = TP - TQ;
v6.y = Tm - Tf;
TA = Tu - Tz;
TE = Tu + Tz;
TD = TB - TC;
TJ = TB + TC;
TK = TH - TG;
TI = TG + TH;
/* Outputs 3 and 7: one fused multiply-add by +/- sqrt(2)/2 each. */
v3.y = fma (KP707106781, TA, Tp);
v3.x = fma (KP707106781, TK, TJ);
v7.x = fma (-KP707106781, TK, TJ);
v7.y = fma (-KP707106781, TA, Tp);
}
}
}
/* Outputs 1 and 5: one fused multiply-add by +/- sqrt(2)/2 each. */
v1.x = fma (KP707106781, TE, TD);
v1.y = fma (KP707106781, TI, TF);
v5.y = fma (-KP707106781, TI, TF);
v5.x = fma (-KP707106781, TE, TD);
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
/*
 * Radix-8 pass over single-precision two-component data.
 *
 * x       - input buffer; each work-item reads 8 elements spaced `threads`
 *           apart
 * y       - output buffer; each work-item writes 8 elements spaced `p` apart
 * p       - output stride, also the modulus that yields the twiddle index
 * threads - number of work-items along dimension 0 that do work
 *
 * Dimension 1 of the NDRange selects the batch; each batch covers
 * `threads * 8` consecutive elements of both buffers.
 */
kernel void radix
(
    global const float2 * x,
    global float2 * y,
    uint p,
    uint threads
)
{
    const size_t item = get_global_id(0);
    if (item >= threads)
        return; /* work-items at or past `threads` do nothing */

    const size_t tw = item % p;
    const size_t batch_base = get_global_id(1) * threads * 8;

    /* Gather the 8 inputs handled by this work-item. */
    global const float2 * src = x + (item + batch_base);
    float2 v0 = src[0 * threads];
    float2 v1 = src[1 * threads];
    float2 v2 = src[2 * threads];
    float2 v3 = src[3 * threads];
    float2 v4 = src[4 * threads];
    float2 v5 = src[5 * threads];
    float2 v6 = src[6 * threads];
    float2 v7 = src[7 * threads];

    /*
     * Twiddle inputs 1..7.  The literals are -2*pi*m/8 rounded to single
     * precision.  With p == 1 the index tw is always 0, so the step is skipped.
     */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((float)-0.7853981852531433 * tw / p));
        v2 = mul(v2, twiddle((float)-1.570796370506287 * tw / p));
        v3 = mul(v3, twiddle((float)-2.356194496154785 * tw / p));
        v4 = mul(v4, twiddle((float)-3.141592741012573 * tw / p));
        v5 = mul(v5, twiddle((float)-3.926990985870361 * tw / p));
        v6 = mul(v6, twiddle((float)-4.71238899230957 * tw / p));
        v7 = mul(v7, twiddle((float)-5.497787475585938 * tw / p));
    }

    /* In-place 8-point transform of the gathered values. */
    dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);

    /* Scatter the results with stride p. */
    const size_t out = tw + (item - tw) * 8;
    global float2 * dst = y + (out + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
}
/* DEVICE expands to nothing, so a `DEVICE void` function is an ordinary function. */
#define DEVICE
/* Scalar and two-component working types used by the generated codelet (single precision). */
typedef float real_t;
typedef float2 real2_t;
/*
 * Multiply a by the complex conjugate of b, treating .x as the real part and
 * .y as the imaginary part:
 *   (a.x + i*a.y) * (b.x - i*b.y)
 */
float2 mul
(
    float2 a,
    float2 b
)
{
    return (float2)(a.x * b.x + a.y * b.y,
                    a.y * b.x - a.x * b.y);
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
/*
 * dft8: in-place 8-point transform on eight two-component values.
 *
 * Every argument is read once on entry (into v0..v7) and overwritten once on
 * exit, so the eight pointed-to locations are both inputs and outputs.
 * Output 0 is the component-wise sum of all inputs; output 4 is the sum of
 * the even-numbered inputs minus the sum of the odd-numbered ones.
 * NOTE(review): .x/.y presumably hold the real/imaginary parts and the
 * transform direction is the one selected by "-sign 1" in the generator
 * command quoted just before this function - confirm against the generator.
 *
 * The body is generated straight-line code; its statement order was chosen by
 * the generator's scheduler and is best left untouched.
 */
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
/* Work on local copies; results are stored back at the end. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
/* sqrt(2)/2: the only multiplicative constant in this codelet. */
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
real_t Tg, T1, T2, Th;
/* Load inputs 0, 4, 2, 6 and form sum/difference of inputs 0 and 4. */
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
TB = T1 - T2;
T3 = T1 + T2;
Tn = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
real_t Tw, Tb, Tc, Tx;
/* Sum/difference of inputs 2 and 6, interleaved with loads of 7 and 3. */
Tw = v7.y;
Tb = v7.x;
To = T4 - T5;
T6 = T4 + T5;
TC = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
/* Sum/difference of inputs 7 and 3, then of inputs 1 and 5. */
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv + Ty;
TH = Ty - Tv;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TP, TG, Tu, Tf, Tm, TQ;
{
real_t T7, Te, TL, TO;
TP = T3 - T6;
T7 = T3 + T6;
TG = Tq + Tt;
Tu = Tq - Tt;
Te = Ta + Td;
Tf = Ta - Td;
Tm = Ti - Tl;
TL = Ti + Tl;
TO = TM + TN;
TQ = TN - TM;
/* Outputs 0 and 4: (even-input sum) +/- (odd-input sum). */
v0.x = T7 + Te;
v0.y = TL + TO;
v4.y = TL - TO;
v4.x = T7 - Te;
}
{
real_t Tp, TA, TJ, TK;
TF = To + Tn;
Tp = Tn - To;
/* Outputs 2 and 6 need only additions and subtractions. */
v2.y = Tf + Tm;
v2.x = TP + TQ;
v6.x = TP - TQ;
v6.y = Tm - Tf;
TA = Tu - Tz;
TE = Tu + Tz;
TD = TB - TC;
TJ = TB + TC;
TK = TH - TG;
TI = TG + TH;
/* Outputs 3 and 7: one fused multiply-add by +/- sqrt(2)/2 each. */
v3.y = fma (KP707106781, TA, Tp);
v3.x = fma (KP707106781, TK, TJ);
v7.x = fma (-KP707106781, TK, TJ);
v7.y = fma (-KP707106781, TA, Tp);
}
}
}
/* Outputs 1 and 5: one fused multiply-add by +/- sqrt(2)/2 each. */
v1.x = fma (KP707106781, TE, TD);
v1.y = fma (KP707106781, TI, TF);
v5.y = fma (-KP707106781, TI, TF);
v5.x = fma (-KP707106781, TE, TD);
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
/*
 * Radix-8 pass over single-precision two-component data.
 *
 * x       - input buffer; each work-item reads 8 elements spaced `threads`
 *           apart
 * y       - output buffer; each work-item writes 8 elements spaced `p` apart
 * p       - output stride, also the modulus that yields the twiddle index
 * threads - number of work-items along dimension 0 that do work
 *
 * Dimension 1 of the NDRange selects the batch; each batch covers
 * `threads * 8` consecutive elements of both buffers.
 */
kernel void radix
(
    global const float2 * x,
    global float2 * y,
    uint p,
    uint threads
)
{
    const size_t item = get_global_id(0);
    if (item >= threads)
        return; /* work-items at or past `threads` do nothing */

    const size_t tw = item % p;
    const size_t batch_base = get_global_id(1) * threads * 8;

    /* Gather the 8 inputs handled by this work-item. */
    global const float2 * src = x + (item + batch_base);
    float2 v0 = src[0 * threads];
    float2 v1 = src[1 * threads];
    float2 v2 = src[2 * threads];
    float2 v3 = src[3 * threads];
    float2 v4 = src[4 * threads];
    float2 v5 = src[5 * threads];
    float2 v6 = src[6 * threads];
    float2 v7 = src[7 * threads];

    /*
     * Twiddle inputs 1..7.  The literals are -2*pi*m/8 rounded to single
     * precision.  With p == 1 the index tw is always 0, so the step is skipped.
     */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((float)-0.7853981852531433 * tw / p));
        v2 = mul(v2, twiddle((float)-1.570796370506287 * tw / p));
        v3 = mul(v3, twiddle((float)-2.356194496154785 * tw / p));
        v4 = mul(v4, twiddle((float)-3.141592741012573 * tw / p));
        v5 = mul(v5, twiddle((float)-3.926990985870361 * tw / p));
        v6 = mul(v6, twiddle((float)-4.71238899230957 * tw / p));
        v7 = mul(v7, twiddle((float)-5.497787475585938 * tw / p));
    }

    /* In-place 8-point transform of the gathered values. */
    dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7);

    /* Scatter the results with stride p. */
    const size_t out = tw + (item - tw) * 8;
    global float2 * dst = y + (out + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
}
/* DEVICE expands to nothing, so a `DEVICE void` function is an ordinary function. */
#define DEVICE
/* Scalar and two-component working types used by the generated codelet (single precision). */
typedef float real_t;
typedef float2 real2_t;
/*
 * Multiply a by the complex conjugate of b, treating .x as the real part and
 * .y as the imaginary part:
 *   (a.x + i*a.y) * (b.x - i*b.y)
 */
float2 mul
(
    float2 a,
    float2 b
)
{
    return (float2)(a.x * b.x + a.y * b.y,
                    a.y * b.x - a.x * b.y);
}
float2 twiddle
(
float alpha
)
{
float2 r = {native_cos(alpha), native_sin(alpha)};
return r;
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 144 FP additions, 40 FP multiplications,
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
* 97 stack variables, 3 constants, and 64 memory accesses
*/
/*
 * dft16: in-place 16-point transform on sixteen two-component values.
 *
 * Every argument is read once on entry (into v0..v15) and overwritten once on
 * exit, so the sixteen pointed-to locations are both inputs and outputs.
 * Output 0 is the component-wise sum of all inputs; output 8 is the sum of
 * the even-numbered inputs minus the sum of the odd-numbered ones.
 * NOTE(review): .x/.y presumably hold the real/imaginary parts and the
 * transform direction is the one selected by "-sign 1" in the generator
 * command quoted just before this function - confirm against the generator.
 *
 * The body is generated straight-line code; its statement order was chosen by
 * the generator's scheduler and is best left untouched.
 */
DEVICE void
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13,
real2_t * u14, real2_t * u15)
{
/* Work on local copies; results are stored back at the end. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
real2_t v13 = *u13;
real2_t v14 = *u14;
real2_t v15 = *u15;
{
/* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(2)/2. */
const real_t KP923879532 =
+0.923879532511286756128183189396788286822416626;
const real_t KP414213562 =
+0.414213562373095048801688724209698078569671875;
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t T1z, T1P, T1N, T1M, T1L, T1J, T1K, T1G, T1O, T1Q;
{
real_t T1H, TN, T25, T7, T1l, T1x, TC, T1R, T1A, T1b, T1Z, Tt,
T2h, T22, T1B;
real_t T1g, T1m, TQ, Te, T1S, T26, TJ, TT, T1n, T12, Tj, T11,
Ti, T1V, TZ;
real_t Tk, T13;
{
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e;
{
real_t Tz, T4, T1j, T3, TM, Ty, T5, TA;
{
real_t Tw, T1, T2, Tx;
/* Load inputs 0, 8, 4, 12 and form sum/difference of inputs 0 and 8. */
Tw = v0.y;
T1 = v0.x;
T2 = v8.x;
Tx = v8.y;
Tz = v4.y;
T4 = v4.x;
T1j = T1 - T2;
T3 = T1 + T2;
TM = Tw - Tx;
Ty = Tw + Tx;
T5 = v12.x;
TA = v12.y;
}
{
real_t T18, Tn, To, T19;
T18 = v15.y;
Tn = v15.x;
{
real_t TL, T6, T1k, TB;
/* Sum/difference of inputs 4 and 12, combined with the 0/8 pair. */
TL = T4 - T5;
T6 = T4 + T5;
T1k = Tz - TA;
TB = Tz + TA;
T1H = TM - TL;
TN = TL + TM;
T25 = T3 - T6;
T7 = T3 + T6;
T1l = T1j - T1k;
T1x = T1j + T1k;
TC = Ty + TB;
T1R = Ty - TB;
To = v7.x;
T19 = v7.y;
}
T1d = v3.y;
Tq = v3.x;
T1c = Tn - To;
Tp = Tn + To;
T20 = T18 + T19;
T1a = T18 - T19;
Tr = v11.x;
T1e = v11.y;
}
}
{
real_t TG, Tb, TO, Ta, TP, TF, Tc, TH;
{
real_t TD, T8, T9, TE;
TD = v2.y;
T8 = v2.x;
{
real_t T17, Ts, T21, T1f;
/* Sum/difference of inputs 3 and 11, combined with the 15/7 pair. */
T17 = Tq - Tr;
Ts = Tq + Tr;
T21 = T1d + T1e;
T1f = T1d - T1e;
T1A = T1a - T17;
T1b = T17 + T1a;
T1Z = Tp - Ts;
Tt = Tp + Ts;
T2h = T20 + T21;
T22 = T20 - T21;
T1B = T1c + T1f;
T1g = T1c - T1f;
T9 = v10.x;
TE = v10.y;
}
TG = v14.y;
Tb = v14.x;
TO = T8 - T9;
Ta = T8 + T9;
TP = TD - TE;
TF = TD + TE;
Tc = v6.x;
TH = v6.y;
}
{
real_t TS, Td, TR, TI;
/* Sum/difference of inputs 14 and 6, combined with the 2/10 pair. */
T1m = TO - TP;
TQ = TO + TP;
TS = Tb - Tc;
Td = Tb + Tc;
TR = TG - TH;
TI = TG + TH;
Te = Ta + Td;
T1S = Ta - Td;
T26 = TI - TF;
TJ = TF + TI;
TT = TR - TS;
T1n = TS + TR;
}
}
{
real_t TX, Tg, Th, TY;
/* Load inputs 1, 9, 5, 13 and form sum/difference of inputs 1 and 9. */
TX = v1.y;
Tg = v1.x;
Th = v9.x;
TY = v9.y;
T12 = v5.y;
Tj = v5.x;
T11 = Tg - Th;
Ti = Tg + Th;
T1V = TX + TY;
TZ = TX - TY;
Tk = v13.x;
T13 = v13.y;
}
}
{
real_t T2j, T1D, T10, T1U, T1X, T1E, T15, Tv, TK, T2k;
{
real_t Tf, Tu, T2f, T2i, T2g;
T2j = T7 - Te;
Tf = T7 + Te;
{
real_t TW, Tl, T1W, T14, Tm;
/* Sum/difference of inputs 5 and 13, combined with the 1/9 pair. */
TW = Tj - Tk;
Tl = Tj + Tk;
T1W = T12 + T13;
T14 = T12 - T13;
T1D = TZ - TW;
T10 = TW + TZ;
T1U = Ti - Tl;
Tm = Ti + Tl;
T2g = T1V + T1W;
T1X = T1V - T1W;
T1E = T11 + T14;
T15 = T11 - T14;
Tu = Tm + Tt;
Tv = Tm - Tt;
}
TK = TC - TJ;
T2f = TC + TJ;
T2i = T2g + T2h;
T2k = T2h - T2g;
/* Outputs 0 and 8: (even-input sum) +/- (odd-input sum). */
v0.x = Tf + Tu;
v0.y = T2f + T2i;
v8.y = T2f - T2i;
v8.x = Tf - Tu;
}
{
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y,
T23;
T29 = T1S + T1R;
T1T = T1R - T1S;
/* Outputs 4 and 12 need only additions and subtractions. */
v4.y = Tv + TK;
v4.x = T2j + T2k;
v12.x = T2j - T2k;
v12.y = TK - Tv;
T27 = T25 + T26;
T2d = T25 - T26;
T2a = T1U + T1X;
T1Y = T1U - T1X;
T23 = T1Z + T22;
T2b = T22 - T1Z;
T28 = T1Y + T23;
T24 = T1Y - T23;
{
real_t T1y, TV, T1r, T1I, T1s, T1t, T1v, T1p, T1q,
T1i;
{
real_t T1o, T2e, T2c, TU, T16, T1h;
T1y = TT - TQ;
TU = TQ + TT;
T2e = T2b - T2a;
T2c = T2a + T2b;
TV = fma (KP707106781, TU, TN);
T1r = fma (-KP707106781, TU, TN);
/* Outputs 2, 6, 10, 14: one fused multiply-add by +/- sqrt(2)/2 each. */
v14.y = fma (-KP707106781, T24, T1T);
v14.x = fma (-KP707106781, T2e, T2d);
v6.x = fma (KP707106781, T2e, T2d);
v6.y = fma (KP707106781, T24, T1T);
v2.x = fma (KP707106781, T28, T27);
v2.y = fma (KP707106781, T2c, T29);
v10.y = fma (-KP707106781, T2c, T29);
v10.x = fma (-KP707106781, T28, T27);
T1o = T1m + T1n;
T1I = T1m - T1n;
T1s = fma (-KP414213562, T10, T15);
T16 = fma (KP414213562, T15, T10);
T1h = fma (-KP414213562, T1g, T1b);
T1t = fma (KP414213562, T1b, T1g);
T1v = fma (KP707106781, T1o, T1l);
T1p = fma (-KP707106781, T1o, T1l);
T1q = T1h - T16;
T1i = T16 + T1h;
}
{
real_t T1w, T1u, T1C, T1F;
T1w = T1s + T1t;
T1u = T1s - T1t;
T1z = fma (KP707106781, T1y, T1x);
T1P = fma (-KP707106781, T1y, T1x);
/* Outputs 1, 9, 5, 13: final fused multiply-add by +/- KP923879532. */
v1.y = fma (KP923879532, T1i, TV);
v1.x = fma (KP923879532, T1w, T1v);
v9.x = fma (-KP923879532, T1w, T1v);
v9.y = fma (-KP923879532, T1i, TV);
v5.x = fma (KP923879532, T1q, T1p);
v5.y = fma (KP923879532, T1u, T1r);
v13.y = fma (-KP923879532, T1u, T1r);
v13.x = fma (-KP923879532, T1q, T1p);
T1N = fma (-KP414213562, T1A, T1B);
T1C = fma (KP414213562, T1B, T1A);
T1F = fma (-KP414213562, T1E, T1D);
T1M = fma (KP414213562, T1D, T1E);
T1L = fma (KP707106781, T1I, T1H);
T1J = fma (-KP707106781, T1I, T1H);
T1K = T1F + T1C;
T1G = T1C - T1F;
}
}
}
}
}
T1O = T1M - T1N;
T1Q = T1M + T1N;
/* Outputs 15, 7, 3, 11: final fused multiply-add by +/- KP923879532. */
v15.y = fma (KP923879532, T1K, T1J);
v15.x = fma (KP923879532, T1Q, T1P);
v7.x = fma (-KP923879532, T1Q, T1P);
v7.y = fma (-KP923879532, T1K, T1J);
v3.x = fma (KP923879532, T1G, T1z);
v3.y = fma (KP923879532, T1O, T1L);
v11.y = fma (-KP923879532, T1O, T1L);
v11.x = fma (-KP923879532, T1G, T1z);
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
*u13 = v13;
*u14 = v14;
*u15 = v15;
}
/*
 * Radix-16 pass over single-precision two-component data.
 *
 * x       - input buffer; each work-item reads 16 elements spaced `threads`
 *           apart
 * y       - output buffer; each work-item writes 16 elements spaced `p` apart
 * p       - output stride, also the modulus that yields the twiddle index
 * threads - number of work-items along dimension 0 that do work
 *
 * Dimension 1 of the NDRange selects the batch; each batch covers
 * `threads * 16` consecutive elements of both buffers.
 */
kernel void radix
(
    global const float2 * x,
    global float2 * y,
    uint p,
    uint threads
)
{
    const size_t item = get_global_id(0);
    if (item >= threads)
        return; /* work-items at or past `threads` do nothing */

    const size_t tw = item % p;
    const size_t batch_base = get_global_id(1) * threads * 16;

    /* Gather the 16 inputs handled by this work-item. */
    global const float2 * src = x + (item + batch_base);
    float2 v0 = src[0 * threads];
    float2 v1 = src[1 * threads];
    float2 v2 = src[2 * threads];
    float2 v3 = src[3 * threads];
    float2 v4 = src[4 * threads];
    float2 v5 = src[5 * threads];
    float2 v6 = src[6 * threads];
    float2 v7 = src[7 * threads];
    float2 v8 = src[8 * threads];
    float2 v9 = src[9 * threads];
    float2 v10 = src[10 * threads];
    float2 v11 = src[11 * threads];
    float2 v12 = src[12 * threads];
    float2 v13 = src[13 * threads];
    float2 v14 = src[14 * threads];
    float2 v15 = src[15 * threads];

    /*
     * Twiddle inputs 1..15.  The literals are -2*pi*m/16 rounded to single
     * precision.  With p == 1 the index tw is always 0, so the step is skipped.
     */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((float)-0.3926990926265717 * tw / p));
        v2 = mul(v2, twiddle((float)-0.7853981852531433 * tw / p));
        v3 = mul(v3, twiddle((float)-1.178097248077393 * tw / p));
        v4 = mul(v4, twiddle((float)-1.570796370506287 * tw / p));
        v5 = mul(v5, twiddle((float)-1.963495492935181 * tw / p));
        v6 = mul(v6, twiddle((float)-2.356194496154785 * tw / p));
        v7 = mul(v7, twiddle((float)-2.748893737792969 * tw / p));
        v8 = mul(v8, twiddle((float)-3.141592741012573 * tw / p));
        v9 = mul(v9, twiddle((float)-3.534291744232178 * tw / p));
        v10 = mul(v10, twiddle((float)-3.926990985870361 * tw / p));
        v11 = mul(v11, twiddle((float)-4.319690227508545 * tw / p));
        v12 = mul(v12, twiddle((float)-4.71238899230957 * tw / p));
        v13 = mul(v13, twiddle((float)-5.105088233947754 * tw / p));
        v14 = mul(v14, twiddle((float)-5.497787475585938 * tw / p));
        v15 = mul(v15, twiddle((float)-5.890486240386963 * tw / p));
    }

    /* In-place 16-point transform of the gathered values. */
    dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15);

    /* Scatter the results with stride p. */
    const size_t out = tw + (item - tw) * 16;
    global float2 * dst = y + (out + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
    dst[11 * p] = v11;
    dst[12 * p] = v12;
    dst[13 * p] = v13;
    dst[14 * p] = v14;
    dst[15 * p] = v15;
}
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
float2 r2c
(
float v
)
{
float2 r = {v, 0}; return r;
}
/*
 * Element-wise kernel: prm_1[idx] = r2c(prm_2[idx]) for every idx in [0, n).
 *
 * The index range is cut into contiguous chunks of ceil(n / global_size)
 * elements; each work-item processes its own chunk, clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float2 * prm_1,
    global float * prm_2
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong limit = first + per_item;
    const ulong last = (n < limit) ? n : limit;

    for (ulong idx = first; idx < last; ++idx)
        prm_1[idx] = r2c( prm_2[idx] );
}
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scale both components of v by the scalar s. */
float2 scl
(
    float2 v,
    float s
)
{
    return (float2)(v.x * s, v.y * s);
}
/*
 * Element-wise kernel: prm_1[idx] = scl(prm_2[idx], prm_3) for every idx in
 * [0, n).
 *
 * The index range is cut into contiguous chunks of ceil(n / global_size)
 * elements; each work-item processes its own chunk, clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float2 * prm_1,
    global float2 * prm_2,
    float prm_3
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong limit = first + per_item;
    const ulong last = (n < limit) ? n : limit;

    for (ulong idx = first; idx < last; ++idx)
        prm_1[idx] = scl( prm_2[idx], prm_3 );
}
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Element-wise copy kernel: prm_1[idx] = prm_2[idx] for every idx in [0, n).
 *
 * The index range is cut into contiguous chunks of ceil(n / global_size)
 * elements; each work-item processes its own chunk, clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float2 * prm_1,
    global float2 * prm_2
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong limit = first + per_item;
    const ulong last = (n < limit) ? n : limit;

    for (ulong idx = first; idx < last; ++idx)
        prm_1[idx] = prm_2[idx];
}
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Reduce a two-component value to a scalar by keeping .x and dropping .y. */
float c2r
(
    float2 v
)
{
    const float first_component = v.x;
    return first_component;
}
/*
 * Element-wise kernel: prm_1[idx] = prm_2 * c2r(prm_3[idx]) for every idx in
 * [0, n).
 *
 * The index range is cut into contiguous chunks of ceil(n / global_size)
 * elements; each work-item processes its own chunk, clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    float prm_2,
    global float2 * prm_3
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong limit = first + per_item;
    const ulong last = (n < limit) ? n : limit;

    for (ulong idx = first; idx < last; ++idx)
        prm_1[idx] = ( prm_2 * c2r( prm_3[idx] ) );
}
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary reduction operator: returns the sum of its two arguments. */
float SUM_float
(
    float prm1,
    float prm2
)
{
    const float total = prm1 + prm2;
    return total;
}
/*
 * Partial reduction: each work-item accumulates
 *   pow(prm_1[idx] - prm_2[idx], prm_3)
 * over its own contiguous chunk of [0, n) (ceil(n / global_size) elements,
 * clipped to n) and stores the result in g_odata[get_group_id(0)].
 *
 * NOTE(review): the output slot is indexed by work-group, so every work-item
 * of a group writes the same element - presumably this variant is launched
 * with one work-item per group; confirm against the host code.
 */
kernel void vexcl_reductor_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    float prm_3,
    global float * g_odata
)
{
    float partial = 0.0f;

    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong limit = first + per_item;
    const ulong last = (n < limit) ? n : limit;

    for (ulong idx = first; idx < last; ++idx)
        partial = SUM_float(partial, pow( ( prm_1[idx] - prm_2[idx] ), prm_3 ));

    g_odata[get_group_id(0)] = partial;
}
FFT(C2C) size=11 batch=1
/* DEVICE expands to nothing, so a `DEVICE void` function is an ordinary function. */
#define DEVICE
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component working types used by the generated codelet (double precision). */
typedef double real_t;
typedef double2 real2_t;
/*
 * Complex product of a and b, treating .x as the real part and .y as the
 * imaginary part:
 *   (a.x + i*a.y) * (b.x + i*b.y)
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    return (double2)(a.x * b.x - a.y * b.y,
                     a.y * b.x + a.x * b.y);
}
double2 twiddle
(
double alpha
)
{
double cs, sn = sincos(alpha, &cs);
double2 r = {cs, sn};
return r;
}/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 96 stack variables, 10 constants, and 44 memory accesses
*/
/*
 * dft11: in-place 11-point transform on eleven two-component values.
 *
 * Every argument is read once on entry (into v0..v10) and overwritten once on
 * exit, so the eleven pointed-to locations are both inputs and outputs.
 * Output 0 is the component-wise sum of all inputs.  The remaining outputs
 * are produced in pairs (1,10), (2,9), (3,8), (4,7), (5,6): the two members
 * of a pair share the same operands and differ only in the sign of the final
 * KP989821441 multiply-add.
 * NOTE(review): .x/.y presumably hold the real/imaginary parts and the
 * transform direction is the one selected by "-sign -1" in the generator
 * command quoted just before this function - confirm against the generator.
 *
 * The body is generated straight-line code; its statement order was chosen by
 * the generator's scheduler and is best left untouched.
 */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10)
{
/* Work on local copies; results are stored back at the end. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
{
/* Multiplicative constants consumed by the fused multiply-add chains below. */
const real_t KP989821441 =
+0.989821441880932732376092037776718787376519372;
const real_t KP959492973 =
+0.959492973614497389890368057066327699062454848;
const real_t KP918985947 =
+0.918985947228994779780736114132655398124909697;
const real_t KP876768831 =
+0.876768831002589333891339807079336796764054852;
const real_t KP830830026 =
+0.830830026003772851058548298459246407048009821;
const real_t KP778434453 =
+0.778434453334651800608337670740821884709317477;
const real_t KP715370323 =
+0.715370323453429719112414662767260662417897278;
const real_t KP634356270 =
+0.634356270682424498893150776899916060542806975;
const real_t KP342584725 =
+0.342584725681637509502641509861112333758894680;
const real_t KP521108558 =
+0.521108558113202722944698153526659300680427422;
{
real_t T1, TA, T1p, T1y, T19, T1d, T1a, T1e;
{
real_t T1f, T1u, T4, T1q, Tg, T1t, T7, T1s, Ta, Td, T1r, TP,
T1X, T26, Ti;
real_t TG, T1O, T1w, TY, T1F, T17, To, T1i, T1k, T1h, Tr, T1j,
Tu, T1g, Tx;
real_t T21, TU, TL, TC, T1S, T1J, T1m, T12, T1z, T1b;
T1f = v0.y;
T1 = v0.x;
{
real_t Tv, Tw, Ty, Tz, Tp, Tq, Tm, Tn, Ts, Tt, T1E, T16,
Tb, Tc;
{
real_t T2, T3, Te, Tf;
/* Load the inputs and pair them as (1,10), (5,6), (2,9), (3,8), (4,7). */
Tv = v1.y;
T2 = v1.x;
T3 = v10.x;
Tw = v10.y;
Ty = v5.y;
Te = v5.x;
Tf = v6.x;
Tz = v6.y;
{
real_t T5, T6, T8, T9;
Tp = v2.y;
T5 = v2.x;
/* .x components: sum and difference of each input pair. */
T1u = T3 - T2;
T4 = T2 + T3;
T1q = Tf - Te;
Tg = Te + Tf;
T6 = v9.x;
Tq = v9.y;
Tm = v3.y;
T8 = v3.x;
T9 = v8.x;
Tn = v8.y;
Ts = v4.y;
Tb = v4.x;
T1t = T6 - T5;
T7 = T5 + T6;
T1s = T9 - T8;
Ta = T8 + T9;
Tc = v7.x;
Tt = v7.y;
}
}
{
real_t T25, Th, T1W, TO;
T25 = fma (KP521108558, T1q, T1u);
T1W = fma (KP521108558, T1s, T1q);
TO = fma (-KP342584725, T4, Ta);
Th = fma (-KP342584725, Ta, T7);
Td = Tb + Tc;
T1r = Tc - Tb;
TP = fma (-KP634356270, TO, Tg);
T1X = fma (-KP715370323, T1W, T1t);
T26 = fma (KP715370323, T25, T1r);
{
real_t TF, T1N, T1v, TX;
TF = fma (-KP342584725, Td, T4);
Ti = fma (-KP634356270, Th, Td);
T1N = fma (-KP521108558, T1t, T1r);
T1v = fma (-KP521108558, T1u, T1t);
TG = fma (-KP634356270, TF, T7);
TX = fma (-KP342584725, T7, Tg);
T1O = fma (KP715370323, T1N, T1q);
T1w = fma (-KP715370323, T1v, T1s);
T1E = fma (KP521108558, T1r, T1s);
TY = fma (-KP634356270, TX, T4);
T16 = fma (-KP342584725, Tg, Td);
}
}
T1F = fma (KP715370323, T1E, T1u);
T17 = fma (-KP634356270, T16, Ta);
/* .y components: sum and difference of each input pair. */
To = Tm - Tn;
T1i = Tm + Tn;
T1k = Ty + Tz;
TA = Ty - Tz;
T1h = Tp + Tq;
Tr = Tp - Tq;
T1j = Ts + Tt;
Tu = Ts - Tt;
{
real_t TB, T1R, T20, TK, TT, T1I, T1l;
T20 = fma (-KP342584725, T1i, T1h);
TK = fma (KP521108558, To, TA);
TT = fma (-KP521108558, Tr, Tu);
T1g = Tv + Tw;
Tx = Tv - Tw;
T21 = fma (-KP634356270, T20, T1j);
TU = fma (KP715370323, TT, TA);
TL = fma (-KP715370323, TK, Tr);
TB = fma (KP521108558, TA, Tx);
T1R = fma (-KP342584725, T1j, T1g);
T1I = fma (-KP342584725, T1g, T1i);
T1l = fma (-KP342584725, T1k, T1j);
TC = fma (KP715370323, TB, Tu);
T1S = fma (-KP634356270, T1R, T1h);
T1J = fma (-KP634356270, T1I, T1k);
T1m = fma (-KP634356270, T1l, T1i);
T12 = fma (KP521108558, Tu, To);
T1z = fma (-KP342584725, T1h, T1k);
T1b = fma (-KP521108558, Tx, Tr);
}
}
{
real_t T13, T1A, T1c, T1Z, T1V, TH, TM, Tj, TD;
T13 = fma (KP715370323, T12, Tx);
T1A = fma (-KP634356270, T1z, T1g);
T1c = fma (-KP715370323, T1b, To);
/* Output 0: input 0 plus all five pair sums, per component. */
v0.y = T1f + T1g + T1h + T1i + T1j + T1k;
v0.x = T1 + T4 + T7 + Ta + Td + Tg;
Tj = fma (-KP778434453, Ti, T4);
TD = fma (KP830830026, TC, Tr);
{
real_t TE, T23, T28, Tl, Tk, T22, T27;
T22 = fma (-KP778434453, T21, T1g);
T27 = fma (KP830830026, T26, T1t);
Tk = fma (-KP876768831, Tj, Tg);
TE = fma (KP918985947, TD, To);
T23 = fma (-KP876768831, T22, T1k);
T28 = fma (KP918985947, T27, T1s);
Tl = fma (-KP959492973, Tk, T1);
{
real_t T1U, T1T, T24, T1Y;
T1T = fma (-KP778434453, T1S, T1k);
T24 = fma (-KP959492973, T23, T1f);
T1Y = fma (KP830830026, T1X, T1u);
T1U = fma (-KP876768831, T1T, T1i);
/* Outputs 1 and 10. */
v10.y = fma (-KP989821441, T28, T24);
v10.x = fma (-KP989821441, TE, Tl);
v1.x = fma (KP989821441, TE, Tl);
v1.y = fma (KP989821441, T28, T24);
T1Z = fma (-KP918985947, T1Y, T1r);
T1V = fma (-KP959492973, T1U, T1f);
}
TH = fma (-KP778434453, TG, Tg);
TM = fma (KP830830026, TL, Tx);
}
{
real_t TS, TW, T1M, TZ, T14, T1Q;
{
real_t TN, TR, TV, TJ, TI, TQ, T1P, T1L, T1K;
TQ = fma (-KP778434453, TP, Td);
TI = fma (-KP876768831, TH, Ta);
TN = fma (-KP918985947, TM, Tu);
TR = fma (-KP876768831, TQ, T7);
TV = fma (-KP830830026, TU, To);
TJ = fma (-KP959492973, TI, T1);
T1K = fma (-KP778434453, T1J, T1j);
TS = fma (-KP959492973, TR, T1);
TW = fma (-KP918985947, TV, Tx);
/* Outputs 2 and 9. */
v9.y = fma (KP989821441, T1Z, T1V);
v9.x = fma (KP989821441, TN, TJ);
v2.x = fma (-KP989821441, TN, TJ);
v2.y = fma (-KP989821441, T1Z, T1V);
T1L = fma (-KP876768831, T1K, T1h);
T1P = fma (-KP830830026, T1O, T1s);
T1M = fma (-KP959492973, T1L, T1f);
TZ = fma (-KP778434453, TY, Ta);
T14 = fma (-KP830830026, T13, TA);
T1Q = fma (-KP918985947, T1P, T1u);
}
{
real_t T15, T11, T1C, T1G, T1B, T10;
T1B = fma (-KP778434453, T1A, T1i);
T10 = fma (-KP876768831, TZ, Td);
T15 = fma (KP918985947, T14, Tr);
/* Outputs 3 and 8. */
v8.y = fma (-KP989821441, T1Q, T1M);
v8.x = fma (-KP989821441, TW, TS);
v3.x = fma (KP989821441, TW, TS);
v3.y = fma (KP989821441, T1Q, T1M);
T11 = fma (-KP959492973, T10, T1);
T1C = fma (-KP876768831, T1B, T1j);
T1G = fma (-KP830830026, T1F, T1q);
{
real_t T1D, T1H, T1o, T1x, T1n, T18;
T1n = fma (-KP778434453, T1m, T1h);
T1D = fma (-KP959492973, T1C, T1f);
T1H = fma (KP918985947, T1G, T1t);
T1o = fma (-KP876768831, T1n, T1g);
T1x = fma (-KP830830026, T1w, T1r);
T18 = fma (-KP778434453, T17, T7);
/* Outputs 4 and 7. */
v7.x = fma (KP989821441, T15, T11);
v7.y = fma (KP989821441, T1H, T1D);
v4.y = fma (-KP989821441, T1H, T1D);
v4.x = fma (-KP989821441, T15, T11);
T1p = fma (-KP959492973, T1o, T1f);
T1y = fma (-KP918985947, T1x, T1q);
T19 = fma (-KP876768831, T18, T4);
T1d = fma (-KP830830026, T1c, Tu);
}
}
}
}
}
T1a = fma (-KP959492973, T19, T1);
T1e = fma (-KP918985947, T1d, TA);
/* Outputs 5 and 6. */
v5.y = fma (KP989821441, T1y, T1p);
v5.x = fma (KP989821441, T1e, T1a);
v6.x = fma (-KP989821441, T1e, T1a);
v6.y = fma (-KP989821441, T1y, T1p);
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
}
/*
 * Radix-11 pass over double-precision two-component data.
 *
 * x       - input buffer; each work-item reads 11 elements spaced `threads`
 *           apart
 * y       - output buffer; each work-item writes 11 elements spaced `p` apart
 * p       - output stride, also the modulus that yields the twiddle index
 * threads - number of work-items along dimension 0 that do work
 *
 * Dimension 1 of the NDRange selects the batch; each batch covers
 * `threads * 11` consecutive elements of both buffers.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t item = get_global_id(0);
    if (item >= threads)
        return; /* work-items at or past `threads` do nothing */

    const size_t tw = item % p;
    const size_t batch_base = get_global_id(1) * threads * 11;

    /* Gather the 11 inputs handled by this work-item. */
    global const double2 * src = x + (item + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];
    double2 v7 = src[7 * threads];
    double2 v8 = src[8 * threads];
    double2 v9 = src[9 * threads];
    double2 v10 = src[10 * threads];

    /*
     * Twiddle inputs 1..10.  The literals are -2*pi*m/11.  With p == 1 the
     * index tw is always 0, so the step is skipped.
     */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.5711986642890533 * tw / p));
        v2 = mul(v2, twiddle((double)-1.142397328578107 * tw / p));
        v3 = mul(v3, twiddle((double)-1.71359599286716 * tw / p));
        v4 = mul(v4, twiddle((double)-2.284794657156213 * tw / p));
        v5 = mul(v5, twiddle((double)-2.855993321445267 * tw / p));
        v6 = mul(v6, twiddle((double)-3.42719198573432 * tw / p));
        v7 = mul(v7, twiddle((double)-3.998390650023373 * tw / p));
        v8 = mul(v8, twiddle((double)-4.569589314312426 * tw / p));
        v9 = mul(v9, twiddle((double)-5.140787978601479 * tw / p));
        v10 = mul(v10, twiddle((double)-5.711986642890533 * tw / p));
    }

    /* In-place 11-point transform of the gathered values. */
    dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10);

    /* Scatter the results with stride p. */
    const size_t out = tw + (item - tw) * 11;
    global double2 * dst = y + (out + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
}
/* DEVICE expands to nothing, so a `DEVICE void` function is an ordinary function. */
#define DEVICE
/*
 * Enable double-precision support through whichever fp64 extension macro the
 * compiler defines: the Khronos one is tried first, then the AMD one.
 */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component working types used by the generated codelet (double precision). */
typedef double real_t;
typedef double2 real2_t;
/*
 * Multiply a by the complex conjugate of b, treating .x as the real part and
 * .y as the imaginary part:
 *   (a.x + i*a.y) * (b.x - i*b.y)
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    return (double2)(a.x * b.x + a.y * b.y,
                     a.y * b.x - a.x * b.y);
}
double2 twiddle
(
double alpha
)
{
double cs, sn = sincos(alpha, &cs);
double2 r = {cs, sn};
return r;
}/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 96 stack variables, 10 constants, and 44 memory accesses
*/
/*
 * dft11: transforms eleven two-component values in place.
 *
 * Each argument points at one real2_t. All eleven inputs are copied into
 * locals first, the results are computed entirely on those locals, and only
 * at the very end are the eleven results stored back through the pointers.
 * The generator banner preceding this function in the file labels it as an
 * n = 11 codelet with sign 1; presumably .x is the real part and .y the
 * imaginary part of each sample (TODO confirm against the caller).
 */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10)
{
/* Copy every input into a local before computing anything. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
{
/* Generator-emitted constants; each name repeats the leading digits of its value. */
const real_t KP989821441 =
+0.989821441880932732376092037776718787376519372;
const real_t KP959492973 =
+0.959492973614497389890368057066327699062454848;
const real_t KP918985947 =
+0.918985947228994779780736114132655398124909697;
const real_t KP876768831 =
+0.876768831002589333891339807079336796764054852;
const real_t KP830830026 =
+0.830830026003772851058548298459246407048009821;
const real_t KP778434453 =
+0.778434453334651800608337670740821884709317477;
const real_t KP634356270 =
+0.634356270682424498893150776899916060542806975;
const real_t KP715370323 =
+0.715370323453429719112414662767260662417897278;
const real_t KP342584725 =
+0.342584725681637509502641509861112333758894680;
const real_t KP521108558 =
+0.521108558113202722944698153526659300680427422;
{
real_t Th, TE, T1p, T1y, T1f, T1j, T1g, T1k;
{
real_t T1, TG, T4, TC, Tg, TF, T7, Ta, TD, Td, TI, T1S, T1J,
TR, T10;
real_t T21, T1m, T19, T1A, T1i, T1t, Tk, T1u, Tw, T1r, Tn,
T1q, Tq, T1s, Tt;
real_t T26, TV, TM, Ty, T1X, T1O, T1w, T13, T1E, T1c;
/* Th / T1: y and x components of input 0. */
Th = v0.y;
T1 = v0.x;
{
real_t Ti, Tj, Tu, Tv, Tl, Tm, To, Tp, Tr, Ts;
{
real_t Tb, Tc, TH, T1R;
{
real_t T2, T3, Te, Tf;
/*
 * The x components of inputs (1,10), (5,6), (2,9), (3,8) are reduced to
 * one difference and one sum per pair here; the (4,7) pair is loaded
 * here and combined a little further down (TD / Td).
 */
Ti = v1.y;
T2 = v1.x;
T3 = v10.x;
Tj = v10.y;
Tu = v5.y;
Te = v5.x;
Tf = v6.x;
Tv = v6.y;
{
real_t T5, T6, T8, T9;
Tl = v2.y;
T5 = v2.x;
TG = T2 - T3;
T4 = T2 + T3;
TC = Te - Tf;
Tg = Te + Tf;
T6 = v9.x;
Tm = v9.y;
To = v3.y;
T8 = v3.x;
T9 = v8.x;
Tp = v8.y;
Tr = v4.y;
Tb = v4.x;
TF = T5 - T6;
T7 = T5 + T6;
TE = T8 - T9;
Ta = T8 + T9;
Tc = v7.x;
Ts = v7.y;
}
}
TH = fma (-KP521108558, TG, TF);
T1R = fma (-KP342584725, T7, Tg);
{
real_t T1l, T18, T1z, T1h;
{
real_t TQ, TZ, T20, T1I;
T1I = fma (-KP342584725, T4, Ta);
TD = Tb - Tc;
Td = Tb + Tc;
TI = fma (-KP715370323, TH, TE);
T1S = fma (-KP634356270, T1R, T4);
TQ = fma (KP521108558, TD, TE);
TZ = fma (-KP521108558, TF, TD);
T20 = fma (-KP342584725, Tg, Td);
T1J = fma (-KP634356270, T1I, Tg);
TR = fma (KP715370323, TQ, TG);
T10 = fma (KP715370323, TZ, TC);
T21 = fma (-KP634356270, T20, Ta);
T1l = fma (-KP342584725, Ta, T7);
}
T18 = fma (KP521108558, TE, TC);
T1z = fma (-KP342584725, Td, T4);
T1h = fma (KP521108558, TC, TG);
T1m = fma (-KP634356270, T1l, Td);
T19 = fma (-KP715370323, T18, TF);
T1A = fma (-KP634356270, T1z, T7);
T1i = fma (KP715370323, T1h, TD);
}
}
/* Same pairing for the y components: one difference and one sum per pair. */
T1t = Tj - Ti;
Tk = Ti + Tj;
T1u = Tv - Tu;
Tw = Tu + Tv;
T1r = Tm - Tl;
Tn = Tl + Tm;
T1q = Tp - To;
Tq = To + Tp;
{
real_t Tx, T1W, T25, TL, TU, T1N, T1v;
T25 = fma (-KP521108558, T1t, T1r);
TL = fma (-KP342584725, Tn, Tw);
TU = fma (-KP342584725, Tk, Tq);
T1s = Ts - Tr;
Tt = Tr + Ts;
T26 = fma (-KP715370323, T25, T1q);
TV = fma (-KP634356270, TU, Tw);
TM = fma (-KP634356270, TL, Tk);
Tx = fma (-KP342584725, Tw, Tt);
T1W = fma (KP521108558, T1s, T1q);
T1N = fma (-KP521108558, T1r, T1s);
T1v = fma (KP521108558, T1u, T1t);
Ty = fma (-KP634356270, Tx, Tq);
T1X = fma (KP715370323, T1W, T1t);
T1O = fma (KP715370323, T1N, T1u);
T1w = fma (KP715370323, T1v, T1s);
T13 = fma (-KP342584725, Tt, Tk);
T1E = fma (KP521108558, T1q, T1u);
T1c = fma (-KP342584725, Tq, Tn);
}
}
{
real_t T14, T1F, T1d, T1Z, T1V, TN, TS, Tz, TJ;
T14 = fma (-KP634356270, T13, Tn);
T1F = fma (-KP715370323, T1E, T1r);
T1d = fma (-KP634356270, T1c, Tt);
/* Output 0: component-wise sum of all eleven inputs. */
v0.y = Th + Tk + Tn + Tq + Tt + Tw;
v0.x = T1 + T4 + T7 + Ta + Td + Tg;
Tz = fma (-KP778434453, Ty, Tn);
TJ = fma (-KP830830026, TI, TD);
{
real_t TK, T23, T28, TB, TA, T22, T27;
T22 = fma (-KP778434453, T21, T7);
T27 = fma (-KP830830026, T26, T1s);
TA = fma (-KP876768831, Tz, Tk);
TK = fma (-KP918985947, TJ, TC);
T23 = fma (-KP876768831, T22, T4);
T28 = fma (-KP918985947, T27, T1u);
TB = fma (-KP959492973, TA, Th);
{
real_t T1U, T1T, T24, T1Y;
T1T = fma (-KP778434453, T1S, Ta);
T24 = fma (-KP959492973, T23, T1);
T1Y = fma (-KP830830026, T1X, T1u);
T1U = fma (-KP876768831, T1T, Td);
/*
 * Outputs 5 and 6 are built from the same two terms, combined with
 * +KP989821441 and -KP989821441 respectively. The later pairs (7,4),
 * (3,8), (9,2) and (1,10) follow the same form.
 */
v5.x = fma (KP989821441, T28, T24);
v5.y = fma (KP989821441, TK, TB);
v6.y = fma (-KP989821441, TK, TB);
v6.x = fma (-KP989821441, T28, T24);
T1Z = fma (KP918985947, T1Y, T1r);
T1V = fma (-KP959492973, T1U, T1);
}
TN = fma (-KP778434453, TM, Tq);
TS = fma (-KP830830026, TR, TC);
}
{
real_t TY, T12, T1M, T15, T1a, T1Q;
{
real_t TT, TX, T11, TP, TO, TW, T1P, T1L, T1K;
TW = fma (-KP778434453, TV, Tt);
TO = fma (-KP876768831, TN, Tt);
TT = fma (KP918985947, TS, TF);
TX = fma (-KP876768831, TW, Tn);
T11 = fma (-KP830830026, T10, TE);
TP = fma (-KP959492973, TO, Th);
T1K = fma (-KP778434453, T1J, Td);
TY = fma (-KP959492973, TX, Th);
T12 = fma (-KP918985947, T11, TG);
v7.x = fma (KP989821441, T1Z, T1V);
v7.y = fma (KP989821441, TT, TP);
v4.y = fma (-KP989821441, TT, TP);
v4.x = fma (-KP989821441, T1Z, T1V);
T1L = fma (-KP876768831, T1K, T7);
T1P = fma (-KP830830026, T1O, T1q);
T1M = fma (-KP959492973, T1L, T1);
T15 = fma (-KP778434453, T14, Tw);
T1a = fma (KP830830026, T19, TG);
T1Q = fma (-KP918985947, T1P, T1t);
}
{
real_t T1b, T17, T1C, T1G, T1B, T16;
T1B = fma (-KP778434453, T1A, Tg);
T16 = fma (-KP876768831, T15, Tq);
T1b = fma (-KP918985947, T1a, TD);
v3.x = fma (KP989821441, T1Q, T1M);
v3.y = fma (KP989821441, T12, TY);
v8.y = fma (-KP989821441, T12, TY);
v8.x = fma (-KP989821441, T1Q, T1M);
T17 = fma (-KP959492973, T16, Th);
T1C = fma (-KP876768831, T1B, Ta);
T1G = fma (KP830830026, T1F, T1t);
{
real_t T1D, T1H, T1o, T1x, T1n, T1e;
T1n = fma (-KP778434453, T1m, T4);
T1D = fma (-KP959492973, T1C, T1);
T1H = fma (-KP918985947, T1G, T1s);
T1o = fma (-KP876768831, T1n, Tg);
T1x = fma (KP830830026, T1w, T1r);
T1e = fma (-KP778434453, T1d, Tk);
v9.y = fma (KP989821441, T1b, T17);
v9.x = fma (KP989821441, T1H, T1D);
v2.x = fma (-KP989821441, T1H, T1D);
v2.y = fma (-KP989821441, T1b, T17);
T1p = fma (-KP959492973, T1o, T1);
T1y = fma (KP918985947, T1x, T1q);
T1f = fma (-KP876768831, T1e, Tw);
T1j = fma (KP830830026, T1i, TF);
}
}
}
}
}
T1g = fma (-KP959492973, T1f, Th);
T1k = fma (KP918985947, T1j, TE);
v10.x = fma (-KP989821441, T1y, T1p);
v10.y = fma (-KP989821441, T1k, T1g);
v1.y = fma (KP989821441, T1k, T1g);
v1.x = fma (KP989821441, T1y, T1p);
}
}
/* Store all eleven results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
}
/*
 * Radix-11 pass. Each work-item along dimension 0 (below `threads`) loads
 * eleven values from x at stride `threads`, applies the twiddle factors
 * when p != 1, runs dft11 on them and stores the results to y at stride p.
 * The dimension-1 id selects a batch of threads * 11 elements.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
    {
        return;
    }

    const size_t rem = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 11;

    /* Gather the eleven inputs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];
    double2 v7 = src[7 * threads];
    double2 v8 = src[8 * threads];
    double2 v9 = src[9 * threads];
    double2 v10 = src[10 * threads];

    /* Input 0 never gets a twiddle; with p == 1 none of them do. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.5711986642890533 * rem / p));
        v2 = mul(v2, twiddle((double)-1.142397328578107 * rem / p));
        v3 = mul(v3, twiddle((double)-1.71359599286716 * rem / p));
        v4 = mul(v4, twiddle((double)-2.284794657156213 * rem / p));
        v5 = mul(v5, twiddle((double)-2.855993321445267 * rem / p));
        v6 = mul(v6, twiddle((double)-3.42719198573432 * rem / p));
        v7 = mul(v7, twiddle((double)-3.998390650023373 * rem / p));
        v8 = mul(v8, twiddle((double)-4.569589314312426 * rem / p));
        v9 = mul(v9, twiddle((double)-5.140787978601479 * rem / p));
        v10 = mul(v10, twiddle((double)-5.711986642890533 * rem / p));
    }

    dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10);

    /* Scatter the results. */
    const size_t out_index = rem + (gid - rem) * 11;
    global double2 * dst = y + (out_index + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/*
 * Element-wise copy prm_1[idx] = prm_2[idx] for idx in [0, n).
 * The index range is cut into equal contiguous chunks, one per work-item
 * of dimension 0; the last chunk is clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global double2 * prm_1,
    global double2 * prm_2
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong unclipped = first + per_item;
    const ulong last = (n < unclipped) ? n : unclipped;

    ulong pos = first;
    while (pos < last)
    {
        prm_1[pos] = prm_2[pos];
        ++pos;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scales both components of v by s and returns the scaled copy. */
double2 scl(double2 v, double s)
{
    v.x = v.x * s;
    v.y = v.y * s;
    return v;
}
/*
 * Element-wise scaled copy prm_1[idx] = scl(prm_2[idx], prm_3) for idx in
 * [0, n). The index range is cut into equal contiguous chunks, one per
 * work-item of dimension 0; the last chunk is clipped to n.
 */
kernel void vexcl_vector_kernel
(
    ulong n,
    global double2 * prm_1,
    global double2 * prm_2,
    double prm_3
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong unclipped = first + per_item;
    const ulong last = (n < unclipped) ? n : unclipped;

    ulong pos = first;
    while (pos < last)
    {
        prm_1[pos] = scl( prm_2[pos], prm_3 );
        ++pos;
    }
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary reduction operator: returns the sum of its two arguments. */
double SUM_double(double prm1, double prm2)
{
    const double total = prm1 + prm2;
    return total;
}
/* Two-component dot product: a.x * b.x + a.y * b.y. */
double dot2(double2 a, double2 b)
{
    const double inner = a.x * b.x + a.y * b.y;
    return inner;
}
/* Component-wise difference a - b. */
double2 minus(double2 a, double2 b)
{
    double2 diff;

    diff.x = a.x - b.x;
    diff.y = a.y - b.y;
    return diff;
}
/*
 * Partial reduction: each work-item accumulates
 * dot2(prm_1[idx] - prm_2[idx], prm_3[idx] - prm_4[idx]) over its own
 * contiguous chunk of [0, n) and writes the partial sum to
 * g_odata[get_group_id(0)].
 */
kernel void vexcl_reductor_kernel
(
    ulong n,
    global double2 * prm_1,
    global double2 * prm_2,
    global double2 * prm_3,
    global double2 * prm_4,
    global double * g_odata
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong unclipped = first + per_item;
    const ulong last = (n < unclipped) ? n : unclipped;

    double partial = (double)0;
    ulong pos = first;
    while (pos < last)
    {
        partial = SUM_double(partial, dot2( minus( prm_1[pos], prm_2[pos] ), minus( prm_3[pos], prm_4[pos] ) ));
        ++pos;
    }

    g_odata[get_group_id(0)] = partial;
}
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Binary reduction operator: returns the sum of its two arguments. */
double SUM_double(double prm1, double prm2)
{
    const double total = prm1 + prm2;
    return total;
}
/* Two-component dot product: a.x * b.x + a.y * b.y. */
double dot2(double2 a, double2 b)
{
    const double inner = a.x * b.x + a.y * b.y;
    return inner;
}
/*
 * Partial reduction: each work-item accumulates dot2(prm_1[idx], prm_2[idx])
 * over its own contiguous chunk of [0, n) and writes the partial sum to
 * g_odata[get_group_id(0)].
 */
kernel void vexcl_reductor_kernel
(
    ulong n,
    global double2 * prm_1,
    global double2 * prm_2,
    global double * g_odata
)
{
    const ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    const ulong first = get_global_id(0) * per_item;
    const ulong unclipped = first + per_item;
    const ulong last = (n < unclipped) ? n : unclipped;

    double partial = (double)0;
    ulong pos = first;
    while (pos < last)
    {
        partial = SUM_double(partial, dot2( prm_1[pos], prm_2[pos] ));
        ++pos;
    }

    g_odata[get_group_id(0)] = partial;
}
FFT(C2C) size=28x11 batch=7
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Complex product a * b, treating .x as the real part and .y as the
 * imaginary part of each operand.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Packs the cosine (x) and sine (y) of alpha into one double2.
 * sincos() returns the sine and writes the cosine through its pointer.
 */
double2 twiddle(double alpha)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};

    return w;
}
/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 96 stack variables, 10 constants, and 44 memory accesses
*/
/*
 * dft11: transforms eleven two-component values in place.
 *
 * Each argument points at one real2_t. All eleven inputs are copied into
 * locals first, the results are computed entirely on those locals, and only
 * at the very end are the eleven results stored back through the pointers.
 * The generator banner preceding this function in the file labels it as an
 * n = 11 codelet with sign -1; presumably .x is the real part and .y the
 * imaginary part of each sample (TODO confirm against the caller).
 */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10)
{
/* Copy every input into a local before computing anything. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
{
/* Generator-emitted constants; each name repeats the leading digits of its value. */
const real_t KP989821441 =
+0.989821441880932732376092037776718787376519372;
const real_t KP959492973 =
+0.959492973614497389890368057066327699062454848;
const real_t KP918985947 =
+0.918985947228994779780736114132655398124909697;
const real_t KP876768831 =
+0.876768831002589333891339807079336796764054852;
const real_t KP830830026 =
+0.830830026003772851058548298459246407048009821;
const real_t KP778434453 =
+0.778434453334651800608337670740821884709317477;
const real_t KP715370323 =
+0.715370323453429719112414662767260662417897278;
const real_t KP634356270 =
+0.634356270682424498893150776899916060542806975;
const real_t KP342584725 =
+0.342584725681637509502641509861112333758894680;
const real_t KP521108558 =
+0.521108558113202722944698153526659300680427422;
{
real_t T1, TA, T1p, T1y, T19, T1d, T1a, T1e;
{
real_t T1f, T1u, T4, T1q, Tg, T1t, T7, T1s, Ta, Td, T1r, TP,
T1X, T26, Ti;
real_t TG, T1O, T1w, TY, T1F, T17, To, T1i, T1k, T1h, Tr, T1j,
Tu, T1g, Tx;
real_t T21, TU, TL, TC, T1S, T1J, T1m, T12, T1z, T1b;
/* T1f / T1: y and x components of input 0. */
T1f = v0.y;
T1 = v0.x;
{
real_t Tv, Tw, Ty, Tz, Tp, Tq, Tm, Tn, Ts, Tt, T1E, T16,
Tb, Tc;
{
real_t T2, T3, Te, Tf;
/*
 * The x components of inputs (1,10), (5,6), (2,9), (3,8) are reduced to
 * one difference and one sum per pair here; the (4,7) pair is loaded
 * here and combined a little further down (Td / T1r).
 */
Tv = v1.y;
T2 = v1.x;
T3 = v10.x;
Tw = v10.y;
Ty = v5.y;
Te = v5.x;
Tf = v6.x;
Tz = v6.y;
{
real_t T5, T6, T8, T9;
Tp = v2.y;
T5 = v2.x;
T1u = T3 - T2;
T4 = T2 + T3;
T1q = Tf - Te;
Tg = Te + Tf;
T6 = v9.x;
Tq = v9.y;
Tm = v3.y;
T8 = v3.x;
T9 = v8.x;
Tn = v8.y;
Ts = v4.y;
Tb = v4.x;
T1t = T6 - T5;
T7 = T5 + T6;
T1s = T9 - T8;
Ta = T8 + T9;
Tc = v7.x;
Tt = v7.y;
}
}
{
real_t T25, Th, T1W, TO;
T25 = fma (KP521108558, T1q, T1u);
T1W = fma (KP521108558, T1s, T1q);
TO = fma (-KP342584725, T4, Ta);
Th = fma (-KP342584725, Ta, T7);
Td = Tb + Tc;
T1r = Tc - Tb;
TP = fma (-KP634356270, TO, Tg);
T1X = fma (-KP715370323, T1W, T1t);
T26 = fma (KP715370323, T25, T1r);
{
real_t TF, T1N, T1v, TX;
TF = fma (-KP342584725, Td, T4);
Ti = fma (-KP634356270, Th, Td);
T1N = fma (-KP521108558, T1t, T1r);
T1v = fma (-KP521108558, T1u, T1t);
TG = fma (-KP634356270, TF, T7);
TX = fma (-KP342584725, T7, Tg);
T1O = fma (KP715370323, T1N, T1q);
T1w = fma (-KP715370323, T1v, T1s);
T1E = fma (KP521108558, T1r, T1s);
TY = fma (-KP634356270, TX, T4);
T16 = fma (-KP342584725, Tg, Td);
}
}
T1F = fma (KP715370323, T1E, T1u);
T17 = fma (-KP634356270, T16, Ta);
/* Same pairing for the y components: one difference and one sum per pair. */
To = Tm - Tn;
T1i = Tm + Tn;
T1k = Ty + Tz;
TA = Ty - Tz;
T1h = Tp + Tq;
Tr = Tp - Tq;
T1j = Ts + Tt;
Tu = Ts - Tt;
{
real_t TB, T1R, T20, TK, TT, T1I, T1l;
T20 = fma (-KP342584725, T1i, T1h);
TK = fma (KP521108558, To, TA);
TT = fma (-KP521108558, Tr, Tu);
T1g = Tv + Tw;
Tx = Tv - Tw;
T21 = fma (-KP634356270, T20, T1j);
TU = fma (KP715370323, TT, TA);
TL = fma (-KP715370323, TK, Tr);
TB = fma (KP521108558, TA, Tx);
T1R = fma (-KP342584725, T1j, T1g);
T1I = fma (-KP342584725, T1g, T1i);
T1l = fma (-KP342584725, T1k, T1j);
TC = fma (KP715370323, TB, Tu);
T1S = fma (-KP634356270, T1R, T1h);
T1J = fma (-KP634356270, T1I, T1k);
T1m = fma (-KP634356270, T1l, T1i);
T12 = fma (KP521108558, Tu, To);
T1z = fma (-KP342584725, T1h, T1k);
T1b = fma (-KP521108558, Tx, Tr);
}
}
{
real_t T13, T1A, T1c, T1Z, T1V, TH, TM, Tj, TD;
T13 = fma (KP715370323, T12, Tx);
T1A = fma (-KP634356270, T1z, T1g);
T1c = fma (-KP715370323, T1b, To);
/* Output 0: component-wise sum of all eleven inputs. */
v0.y = T1f + T1g + T1h + T1i + T1j + T1k;
v0.x = T1 + T4 + T7 + Ta + Td + Tg;
Tj = fma (-KP778434453, Ti, T4);
TD = fma (KP830830026, TC, Tr);
{
real_t TE, T23, T28, Tl, Tk, T22, T27;
T22 = fma (-KP778434453, T21, T1g);
T27 = fma (KP830830026, T26, T1t);
Tk = fma (-KP876768831, Tj, Tg);
TE = fma (KP918985947, TD, To);
T23 = fma (-KP876768831, T22, T1k);
T28 = fma (KP918985947, T27, T1s);
Tl = fma (-KP959492973, Tk, T1);
{
real_t T1U, T1T, T24, T1Y;
T1T = fma (-KP778434453, T1S, T1k);
T24 = fma (-KP959492973, T23, T1f);
T1Y = fma (KP830830026, T1X, T1u);
T1U = fma (-KP876768831, T1T, T1i);
/*
 * Outputs 10 and 1 are built from the same two terms, combined with
 * -KP989821441 and +KP989821441 respectively. The later pairs (9,2),
 * (8,3), (7,4) and (5,6) follow the same form.
 */
v10.y = fma (-KP989821441, T28, T24);
v10.x = fma (-KP989821441, TE, Tl);
v1.x = fma (KP989821441, TE, Tl);
v1.y = fma (KP989821441, T28, T24);
T1Z = fma (-KP918985947, T1Y, T1r);
T1V = fma (-KP959492973, T1U, T1f);
}
TH = fma (-KP778434453, TG, Tg);
TM = fma (KP830830026, TL, Tx);
}
{
real_t TS, TW, T1M, TZ, T14, T1Q;
{
real_t TN, TR, TV, TJ, TI, TQ, T1P, T1L, T1K;
TQ = fma (-KP778434453, TP, Td);
TI = fma (-KP876768831, TH, Ta);
TN = fma (-KP918985947, TM, Tu);
TR = fma (-KP876768831, TQ, T7);
TV = fma (-KP830830026, TU, To);
TJ = fma (-KP959492973, TI, T1);
T1K = fma (-KP778434453, T1J, T1j);
TS = fma (-KP959492973, TR, T1);
TW = fma (-KP918985947, TV, Tx);
v9.y = fma (KP989821441, T1Z, T1V);
v9.x = fma (KP989821441, TN, TJ);
v2.x = fma (-KP989821441, TN, TJ);
v2.y = fma (-KP989821441, T1Z, T1V);
T1L = fma (-KP876768831, T1K, T1h);
T1P = fma (-KP830830026, T1O, T1s);
T1M = fma (-KP959492973, T1L, T1f);
TZ = fma (-KP778434453, TY, Ta);
T14 = fma (-KP830830026, T13, TA);
T1Q = fma (-KP918985947, T1P, T1u);
}
{
real_t T15, T11, T1C, T1G, T1B, T10;
T1B = fma (-KP778434453, T1A, T1i);
T10 = fma (-KP876768831, TZ, Td);
T15 = fma (KP918985947, T14, Tr);
v8.y = fma (-KP989821441, T1Q, T1M);
v8.x = fma (-KP989821441, TW, TS);
v3.x = fma (KP989821441, TW, TS);
v3.y = fma (KP989821441, T1Q, T1M);
T11 = fma (-KP959492973, T10, T1);
T1C = fma (-KP876768831, T1B, T1j);
T1G = fma (-KP830830026, T1F, T1q);
{
real_t T1D, T1H, T1o, T1x, T1n, T18;
T1n = fma (-KP778434453, T1m, T1h);
T1D = fma (-KP959492973, T1C, T1f);
T1H = fma (KP918985947, T1G, T1t);
T1o = fma (-KP876768831, T1n, T1g);
T1x = fma (-KP830830026, T1w, T1r);
T18 = fma (-KP778434453, T17, T7);
v7.x = fma (KP989821441, T15, T11);
v7.y = fma (KP989821441, T1H, T1D);
v4.y = fma (-KP989821441, T1H, T1D);
v4.x = fma (-KP989821441, T15, T11);
T1p = fma (-KP959492973, T1o, T1f);
T1y = fma (-KP918985947, T1x, T1q);
T19 = fma (-KP876768831, T18, T4);
T1d = fma (-KP830830026, T1c, Tu);
}
}
}
}
}
T1a = fma (-KP959492973, T19, T1);
T1e = fma (-KP918985947, T1d, TA);
v5.y = fma (KP989821441, T1y, T1p);
v5.x = fma (KP989821441, T1e, T1a);
v6.x = fma (-KP989821441, T1e, T1a);
v6.y = fma (-KP989821441, T1y, T1p);
}
}
/* Store all eleven results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
}
/*
 * Radix-11 pass. Each work-item along dimension 0 (below `threads`) loads
 * eleven values from x at stride `threads`, applies the twiddle factors
 * when p != 1, runs dft11 on them and stores the results to y at stride p.
 * The dimension-1 id selects a batch of threads * 11 elements.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
    {
        return;
    }

    const size_t rem = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 11;

    /* Gather the eleven inputs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];
    double2 v7 = src[7 * threads];
    double2 v8 = src[8 * threads];
    double2 v9 = src[9 * threads];
    double2 v10 = src[10 * threads];

    /* Input 0 never gets a twiddle; with p == 1 none of them do. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.5711986642890533 * rem / p));
        v2 = mul(v2, twiddle((double)-1.142397328578107 * rem / p));
        v3 = mul(v3, twiddle((double)-1.71359599286716 * rem / p));
        v4 = mul(v4, twiddle((double)-2.284794657156213 * rem / p));
        v5 = mul(v5, twiddle((double)-2.855993321445267 * rem / p));
        v6 = mul(v6, twiddle((double)-3.42719198573432 * rem / p));
        v7 = mul(v7, twiddle((double)-3.998390650023373 * rem / p));
        v8 = mul(v8, twiddle((double)-4.569589314312426 * rem / p));
        v9 = mul(v9, twiddle((double)-5.140787978601479 * rem / p));
        v10 = mul(v10, twiddle((double)-5.711986642890533 * rem / p));
    }

    dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10);

    /* Scatter the results. */
    const size_t out_index = rem + (gid - rem) * 11;
    global double2 * dst = y + (out_index + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Transposes a width x height array of double2: the element read from
 * input[x + y * width] is staged in a one-element local buffer and then
 * written to output[target_x + target_y * height]. Work-items outside
 * the width x height range still reach the barrier but touch no memory.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 tile[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1) * 1;
    const size_t out_y = lx + get_group_id(0) * 1;

    const size_t slot = lx + ly * 1;
    const bool in_range = gx < width && gy < height;

    if (in_range)
    {
        tile[slot] = input[gx + gy * width];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range)
    {
        output[out_x + out_y * height] = tile[slot];
    }
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Complex product a * b, treating .x as the real part and .y as the
 * imaginary part of each operand.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Packs the cosine (x) and sine (y) of alpha into one double2.
 * sincos() returns the sine and writes the cosine through its pointer.
 */
double2 twiddle(double alpha)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};

    return w;
}
/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 16 stack variables, 0 constants, and 16 memory accesses
*/
/*
 * dft4: transforms four two-component values in place using additions
 * and subtractions only. All inputs are read before any output is stored.
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
    const real2_t in0 = *u0;
    const real2_t in1 = *u1;
    const real2_t in2 = *u2;
    const real2_t in3 = *u3;

    /* Combine inputs 0 and 2. */
    const real_t sum02_x = in0.x + in2.x;
    const real_t dif02_x = in0.x - in2.x;
    const real_t sum02_y = in0.y + in2.y;
    const real_t dif02_y = in0.y - in2.y;

    /* Combine inputs 1 and 3. */
    const real_t sum13_x = in1.x + in3.x;
    const real_t dif13_x = in1.x - in3.x;
    const real_t sum13_y = in1.y + in3.y;
    const real_t dif13_y = in1.y - in3.y;

    real2_t out0;
    real2_t out1;
    real2_t out2;
    real2_t out3;

    out0.x = sum02_x + sum13_x;
    out0.y = sum02_y + sum13_y;

    out1.x = dif02_x + dif13_y;
    out1.y = dif02_y - dif13_x;

    out2.x = sum02_x - sum13_x;
    out2.y = sum02_y - sum13_y;

    out3.x = dif02_x - dif13_y;
    out3.y = dif13_x + dif02_y;

    *u0 = out0;
    *u1 = out1;
    *u2 = out2;
    *u3 = out3;
}
/*
 * Radix-4 pass. Each work-item along dimension 0 (below `threads`) loads
 * four values from x at stride `threads`, applies the twiddle factors
 * when p != 1, runs dft4 on them and stores the results to y at stride p.
 * The dimension-1 id selects a batch of threads * 4 elements.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
    {
        return;
    }

    const size_t rem = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 4;

    /* Gather the four inputs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];

    /* Input 0 never gets a twiddle; with p == 1 none of them do. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-1.570796326794897 * rem / p));
        v2 = mul(v2, twiddle((double)-3.141592653589793 * rem / p));
        v3 = mul(v3, twiddle((double)-4.71238898038469 * rem / p));
    }

    dft4(&v0, &v1, &v2, &v3);

    /* Scatter the results. */
    const size_t out_index = rem + (gid - rem) * 4;
    global double2 * dst = y + (out_index + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Complex product a * b, treating .x as the real part and .y as the
 * imaginary part of each operand.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Packs the cosine (x) and sine (y) of alpha into one double2.
 * sincos() returns the sine and writes the cosine through its pointer.
 */
double2 twiddle(double alpha)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};

    return w;
}
/* Generated by: ./cl_gen_notw.native -n 7 -name dft7 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 60 FP additions, 42 FP multiplications,
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
* 57 stack variables, 6 constants, and 28 memory accesses
*/
/*
 * dft7: transforms seven two-component values in place.
 *
 * Each argument points at one real2_t. All seven inputs are copied into
 * locals first, the results are computed entirely on those locals, and only
 * at the very end are the seven results stored back through the pointers.
 * The generator banner preceding this function in the file labels it as an
 * n = 7 codelet with sign -1; presumably .x is the real part and .y the
 * imaginary part of each sample (TODO confirm against the caller).
 */
DEVICE void
dft7 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6)
{
/* Copy every input into a local before computing anything. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
{
/* Generator-emitted constants; each name repeats the leading digits of its value. */
const real_t KP974927912 =
+0.974927912181823607018131682993931217232785801;
const real_t KP900968867 =
+0.900968867902419126236102319507445051165919162;
const real_t KP801937735 =
+0.801937735804838252472204639014890102331838324;
const real_t KP692021471 =
+0.692021471630095869627814897002069140197260599;
const real_t KP356895867 =
+0.356895867892209443894399510021300583399127187;
const real_t KP554958132 =
+0.554958132087371191422194871006410481067288862;
{
real_t Tz, Tt, Tr, TP, Ty, TK, TN, TE, Tw, TF;
{
real_t T1, Th, Ti, Tk, Tl, TI, T4, TG, Ta, TT, Tp, TH, T7, TJ,
TO;
real_t Tu, Tb, TB, Tg;
/* Tz / T1: y and x components of input 0. */
Tz = v0.y;
T1 = v0.x;
{
real_t Te, T5, T6, Tf;
{
real_t T2, T3, T8, T9;
/*
 * Inputs are handled in pairs (1,6), (3,4) and (2,5): each pair's x
 * components are reduced to one difference and one sum; the y
 * components get the same treatment further down.
 */
Th = v1.y;
T2 = v1.x;
T3 = v6.x;
Ti = v6.y;
Tk = v3.y;
T8 = v3.x;
T9 = v4.x;
Tl = v4.y;
Te = v2.y;
T5 = v2.x;
TI = T3 - T2;
T4 = T2 + T3;
TG = T9 - T8;
Ta = T8 + T9;
T6 = v5.x;
Tf = v5.y;
}
TT = fma (KP554958132, TG, TI);
Tp = fma (-KP356895867, T4, Ta);
TH = T6 - T5;
T7 = T5 + T6;
TJ = fma (-KP554958132, TI, TH);
TO = fma (KP554958132, TH, TG);
Tu = fma (-KP356895867, Ta, T7);
Tb = fma (-KP356895867, T7, T4);
TB = Te + Tf;
Tg = Te - Tf;
}
{
real_t Tm, TA, Tj, TD, Ts, TL, Tx, TU, To, TR, Td;
{
real_t TC, TQ, Tn, Tc;
TC = Tk + Tl;
Tm = Tk - Tl;
TA = Th + Ti;
Tj = Th - Ti;
TD = fma (-KP356895867, TC, TB);
Ts = fma (KP554958132, Tg, Tm);
TL = fma (-KP356895867, TA, TC);
TQ = fma (-KP356895867, TB, TA);
Tx = fma (-KP554958132, Tj, Tg);
Tn = fma (KP554958132, Tm, Tj);
/* Output 0: component-wise sum of all seven inputs. */
v0.y = Tz + TA + TB + TC;
v0.x = T1 + T4 + T7 + Ta;
Tc = fma (-KP692021471, Tb, Ta);
TU = fma (KP801937735, TT, TH);
To = fma (KP801937735, Tn, Tg);
TR = fma (-KP692021471, TQ, TC);
Td = fma (-KP900968867, Tc, T1);
}
{
real_t TM, TS, Tq, Tv;
Tt = fma (-KP801937735, Ts, Tj);
Tq = fma (-KP692021471, Tp, T7);
TS = fma (-KP900968867, TR, Tz);
Tr = fma (-KP900968867, Tq, T1);
/*
 * Outputs 6 and 1 are built from the same two terms, combined with
 * -KP974927912 and +KP974927912 respectively. The pairs (5,2) and (4,3)
 * below follow the same form.
 */
v6.y = fma (-KP974927912, TU, TS);
v6.x = fma (-KP974927912, To, Td);
v1.x = fma (KP974927912, To, Td);
v1.y = fma (KP974927912, TU, TS);
TP = fma (-KP801937735, TO, TI);
TM = fma (-KP692021471, TL, TB);
Ty = fma (-KP801937735, Tx, Tm);
Tv = fma (-KP692021471, Tu, T4);
TK = fma (-KP801937735, TJ, TG);
TN = fma (-KP900968867, TM, Tz);
TE = fma (-KP692021471, TD, TA);
Tw = fma (-KP900968867, Tv, T1);
}
}
}
v5.y = fma (-KP974927912, TP, TN);
v5.x = fma (-KP974927912, Tt, Tr);
v2.x = fma (KP974927912, Tt, Tr);
v2.y = fma (KP974927912, TP, TN);
TF = fma (-KP900968867, TE, Tz);
v4.x = fma (-KP974927912, Ty, Tw);
v4.y = fma (-KP974927912, TK, TF);
v3.y = fma (KP974927912, TK, TF);
v3.x = fma (KP974927912, Ty, Tw);
}
}
/* Store all seven results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
}
/*
 * Radix-7 pass. Each work-item along dimension 0 (below `threads`) loads
 * seven values from x at stride `threads`, applies the twiddle factors
 * when p != 1, runs dft7 on them and stores the results to y at stride p.
 * The dimension-1 id selects a batch of threads * 7 elements.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
    {
        return;
    }

    const size_t rem = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 7;

    /* Gather the seven inputs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];

    /* Input 0 never gets a twiddle; with p == 1 none of them do. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.8975979010256552 * rem / p));
        v2 = mul(v2, twiddle((double)-1.79519580205131 * rem / p));
        v3 = mul(v3, twiddle((double)-2.692793703076966 * rem / p));
        v4 = mul(v4, twiddle((double)-3.590391604102621 * rem / p));
        v5 = mul(v5, twiddle((double)-4.487989505128276 * rem / p));
        v6 = mul(v6, twiddle((double)-5.385587406153931 * rem / p));
    }

    dft7(&v0, &v1, &v2, &v3, &v4, &v5, &v6);

    /* Scatter the results. */
    const size_t out_index = rem + (gid - rem) * 7;
    global double2 * dst = y + (out_index + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Transposes a width x height array of double2: the element read from
 * input[x + y * width] is staged in a one-element local buffer and then
 * written to output[target_x + target_y * height]. Work-items outside
 * the width x height range still reach the barrier but touch no memory.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 tile[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1) * 1;
    const size_t out_y = lx + get_group_id(0) * 1;

    const size_t slot = lx + ly * 1;
    const bool in_range = gx < width && gy < height;

    if (in_range)
    {
        tile[slot] = input[gx + gy * width];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range)
    {
        output[out_x + out_y * height] = tile[slot];
    }
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Transposes a width x height array of double2: the element read from
 * input[x + y * width] is staged in a one-element local buffer and then
 * written to output[target_x + target_y * height]. Work-items outside
 * the width x height range still reach the barrier but touch no memory.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 tile[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1) * 1;
    const size_t out_y = lx + get_group_id(0) * 1;

    const size_t slot = lx + ly * 1;
    const bool in_range = gx < width && gy < height;

    if (in_range)
    {
        tile[slot] = input[gx + gy * width];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range)
    {
        output[out_x + out_y * height] = tile[slot];
    }
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Multiplies a by the complex conjugate of b, treating .x as the real
 * part and .y as the imaginary part of each operand.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/*
 * Packs the cosine (x) and sine (y) of alpha into one double2.
 * sincos() returns the sine and writes the cosine through its pointer.
 */
double2 twiddle(double alpha)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};

    return w;
}
/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 96 stack variables, 10 constants, and 44 memory accesses
*/
/*
 * dft11 - generated straight-line codelet working on eleven real2_t values.
 *
 * Every argument is loaded once into a local copy (v0..v10); the locals are
 * recombined with the KP* constants through fma(), and the results are
 * stored back through the same pointers, i.e. the operation is in place.
 * According to the generator banner preceding this function it is the
 * size-11 transform generated with -sign 1; .x/.y presumably carry the
 * real/imaginary parts - confirm against the callers.
 *
 * NOTE(review): the statement order was produced by the generator's
 * scheduler; edit the generator input rather than this body.
 */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10)
{
/* Load all inputs into locals before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
{
/* Generator-emitted constants; each name is KP plus the leading digits. */
const real_t KP989821441 =
+0.989821441880932732376092037776718787376519372;
const real_t KP959492973 =
+0.959492973614497389890368057066327699062454848;
const real_t KP918985947 =
+0.918985947228994779780736114132655398124909697;
const real_t KP876768831 =
+0.876768831002589333891339807079336796764054852;
const real_t KP830830026 =
+0.830830026003772851058548298459246407048009821;
const real_t KP778434453 =
+0.778434453334651800608337670740821884709317477;
const real_t KP634356270 =
+0.634356270682424498893150776899916060542806975;
const real_t KP715370323 =
+0.715370323453429719112414662767260662417897278;
const real_t KP342584725 =
+0.342584725681637509502641509861112333758894680;
const real_t KP521108558 =
+0.521108558113202722944698153526659300680427422;
{
real_t Th, TE, T1p, T1y, T1f, T1j, T1g, T1k;
{
real_t T1, TG, T4, TC, Tg, TF, T7, Ta, TD, Td, TI, T1S, T1J,
TR, T10;
real_t T21, T1m, T19, T1A, T1i, T1t, Tk, T1u, Tw, T1r, Tn,
T1q, Tq, T1s, Tt;
real_t T26, TV, TM, Ty, T1X, T1O, T1w, T13, T1E, T1c;
Th = v0.y;
T1 = v0.x;
{
real_t Ti, Tj, Tu, Tv, Tl, Tm, To, Tp, Tr, Ts;
{
real_t Tb, Tc, TH, T1R;
{
real_t T2, T3, Te, Tf;
/* Inputs are consumed in pairs (1,10), (5,6), (2,9), (3,8), (4,7). */
Ti = v1.y;
T2 = v1.x;
T3 = v10.x;
Tj = v10.y;
Tu = v5.y;
Te = v5.x;
Tf = v6.x;
Tv = v6.y;
{
real_t T5, T6, T8, T9;
Tl = v2.y;
T5 = v2.x;
TG = T2 - T3;
T4 = T2 + T3;
TC = Te - Tf;
Tg = Te + Tf;
T6 = v9.x;
Tm = v9.y;
To = v3.y;
T8 = v3.x;
T9 = v8.x;
Tp = v8.y;
Tr = v4.y;
Tb = v4.x;
TF = T5 - T6;
T7 = T5 + T6;
TE = T8 - T9;
Ta = T8 + T9;
Tc = v7.x;
Ts = v7.y;
}
}
TH = fma (-KP521108558, TG, TF);
T1R = fma (-KP342584725, T7, Tg);
{
real_t T1l, T18, T1z, T1h;
{
real_t TQ, TZ, T20, T1I;
T1I = fma (-KP342584725, T4, Ta);
TD = Tb - Tc;
Td = Tb + Tc;
TI = fma (-KP715370323, TH, TE);
T1S = fma (-KP634356270, T1R, T4);
TQ = fma (KP521108558, TD, TE);
TZ = fma (-KP521108558, TF, TD);
T20 = fma (-KP342584725, Tg, Td);
T1J = fma (-KP634356270, T1I, Tg);
TR = fma (KP715370323, TQ, TG);
T10 = fma (KP715370323, TZ, TC);
T21 = fma (-KP634356270, T20, Ta);
T1l = fma (-KP342584725, Ta, T7);
}
T18 = fma (KP521108558, TE, TC);
T1z = fma (-KP342584725, Td, T4);
T1h = fma (KP521108558, TC, TG);
T1m = fma (-KP634356270, T1l, Td);
T19 = fma (-KP715370323, T18, TF);
T1A = fma (-KP634356270, T1z, T7);
T1i = fma (KP715370323, T1h, TD);
}
}
T1t = Tj - Ti;
Tk = Ti + Tj;
T1u = Tv - Tu;
Tw = Tu + Tv;
T1r = Tm - Tl;
Tn = Tl + Tm;
T1q = Tp - To;
Tq = To + Tp;
{
real_t Tx, T1W, T25, TL, TU, T1N, T1v;
T25 = fma (-KP521108558, T1t, T1r);
TL = fma (-KP342584725, Tn, Tw);
TU = fma (-KP342584725, Tk, Tq);
T1s = Ts - Tr;
Tt = Tr + Ts;
T26 = fma (-KP715370323, T25, T1q);
TV = fma (-KP634356270, TU, Tw);
TM = fma (-KP634356270, TL, Tk);
Tx = fma (-KP342584725, Tw, Tt);
T1W = fma (KP521108558, T1s, T1q);
T1N = fma (-KP521108558, T1r, T1s);
T1v = fma (KP521108558, T1u, T1t);
Ty = fma (-KP634356270, Tx, Tq);
T1X = fma (KP715370323, T1W, T1t);
T1O = fma (KP715370323, T1N, T1u);
T1w = fma (KP715370323, T1v, T1s);
T13 = fma (-KP342584725, Tt, Tk);
T1E = fma (KP521108558, T1q, T1u);
T1c = fma (-KP342584725, Tq, Tn);
}
}
{
real_t T14, T1F, T1d, T1Z, T1V, TN, TS, Tz, TJ;
T14 = fma (-KP634356270, T13, Tn);
T1F = fma (-KP715370323, T1E, T1r);
T1d = fma (-KP634356270, T1c, Tt);
/* Output 0 is the plain sum of all eleven inputs, per component. */
v0.y = Th + Tk + Tn + Tq + Tt + Tw;
v0.x = T1 + T4 + T7 + Ta + Td + Tg;
Tz = fma (-KP778434453, Ty, Tn);
TJ = fma (-KP830830026, TI, TD);
{
real_t TK, T23, T28, TB, TA, T22, T27;
T22 = fma (-KP778434453, T21, T7);
T27 = fma (-KP830830026, T26, T1s);
TA = fma (-KP876768831, Tz, Tk);
TK = fma (-KP918985947, TJ, TC);
T23 = fma (-KP876768831, T22, T4);
T28 = fma (-KP918985947, T27, T1u);
TB = fma (-KP959492973, TA, Th);
{
real_t T1U, T1T, T24, T1Y;
T1T = fma (-KP778434453, T1S, Ta);
T24 = fma (-KP959492973, T23, T1);
T1Y = fma (-KP830830026, T1X, T1u);
T1U = fma (-KP876768831, T1T, Td);
v5.x = fma (KP989821441, T28, T24);
v5.y = fma (KP989821441, TK, TB);
v6.y = fma (-KP989821441, TK, TB);
v6.x = fma (-KP989821441, T28, T24);
T1Z = fma (KP918985947, T1Y, T1r);
T1V = fma (-KP959492973, T1U, T1);
}
TN = fma (-KP778434453, TM, Tq);
TS = fma (-KP830830026, TR, TC);
}
{
real_t TY, T12, T1M, T15, T1a, T1Q;
{
real_t TT, TX, T11, TP, TO, TW, T1P, T1L, T1K;
TW = fma (-KP778434453, TV, Tt);
TO = fma (-KP876768831, TN, Tt);
TT = fma (KP918985947, TS, TF);
TX = fma (-KP876768831, TW, Tn);
T11 = fma (-KP830830026, T10, TE);
TP = fma (-KP959492973, TO, Th);
T1K = fma (-KP778434453, T1J, Td);
TY = fma (-KP959492973, TX, Th);
T12 = fma (-KP918985947, T11, TG);
v7.x = fma (KP989821441, T1Z, T1V);
v7.y = fma (KP989821441, TT, TP);
v4.y = fma (-KP989821441, TT, TP);
v4.x = fma (-KP989821441, T1Z, T1V);
T1L = fma (-KP876768831, T1K, T7);
T1P = fma (-KP830830026, T1O, T1q);
T1M = fma (-KP959492973, T1L, T1);
T15 = fma (-KP778434453, T14, Tw);
T1a = fma (KP830830026, T19, TG);
T1Q = fma (-KP918985947, T1P, T1t);
}
{
real_t T1b, T17, T1C, T1G, T1B, T16;
T1B = fma (-KP778434453, T1A, Tg);
T16 = fma (-KP876768831, T15, Tq);
T1b = fma (-KP918985947, T1a, TD);
v3.x = fma (KP989821441, T1Q, T1M);
v3.y = fma (KP989821441, T12, TY);
v8.y = fma (-KP989821441, T12, TY);
v8.x = fma (-KP989821441, T1Q, T1M);
T17 = fma (-KP959492973, T16, Th);
T1C = fma (-KP876768831, T1B, Ta);
T1G = fma (KP830830026, T1F, T1t);
{
real_t T1D, T1H, T1o, T1x, T1n, T1e;
T1n = fma (-KP778434453, T1m, T4);
T1D = fma (-KP959492973, T1C, T1);
T1H = fma (-KP918985947, T1G, T1s);
T1o = fma (-KP876768831, T1n, Tg);
T1x = fma (KP830830026, T1w, T1r);
T1e = fma (-KP778434453, T1d, Tk);
v9.y = fma (KP989821441, T1b, T17);
v9.x = fma (KP989821441, T1H, T1D);
v2.x = fma (-KP989821441, T1H, T1D);
v2.y = fma (-KP989821441, T1b, T17);
T1p = fma (-KP959492973, T1o, T1);
T1y = fma (KP918985947, T1x, T1q);
T1f = fma (-KP876768831, T1e, Tw);
T1j = fma (KP830830026, T1i, TF);
}
}
}
}
}
T1g = fma (-KP959492973, T1f, Th);
T1k = fma (KP918985947, T1j, TE);
v10.x = fma (-KP989821441, T1y, T1p);
v10.y = fma (-KP989821441, T1k, T1g);
v1.y = fma (KP989821441, T1k, T1g);
v1.x = fma (KP989821441, T1y, T1p);
}
}
/* Store the results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
}
/*
 * One radix-11 pass.
 *
 * Each work-item with global id (0) below `threads` gathers eleven inputs
 * spaced `threads` elements apart, rotates inputs 1..10 by twiddle factors
 * unless p == 1, runs dft11 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 11 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 11;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];
    double2 v7 = src[7 * threads];
    double2 v8 = src[8 * threads];
    double2 v9 = src[9 * threads];
    double2 v10 = src[10 * threads];

    /* Angle literals are multiples of about -2*pi/11, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.5711986642890533 * pos / p));
        v2 = mul(v2, twiddle((double)-1.142397328578107 * pos / p));
        v3 = mul(v3, twiddle((double)-1.71359599286716 * pos / p));
        v4 = mul(v4, twiddle((double)-2.284794657156213 * pos / p));
        v5 = mul(v5, twiddle((double)-2.855993321445267 * pos / p));
        v6 = mul(v6, twiddle((double)-3.42719198573432 * pos / p));
        v7 = mul(v7, twiddle((double)-3.998390650023373 * pos / p));
        v8 = mul(v8, twiddle((double)-4.569589314312426 * pos / p));
        v9 = mul(v9, twiddle((double)-5.140787978601479 * pos / p));
        v10 = mul(v10, twiddle((double)-5.711986642890533 * pos / p));
    }

    dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 11;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Copy input[x + y * width] to output[y' + x' * height] through a local
 * staging tile, for work-items whose global ids satisfy x < width and
 * y < height.
 *
 * The tile edge is 1 in this generated instance (see the `* 1` factors and
 * the one-element tile), so each work-group stages a single element.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 block[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Slot inside the tile, and destination coordinates with x/y swapped. */
    const size_t slot = lx + ly * 1;
    const size_t dst_x = ly + get_group_id(1) * 1;
    const size_t dst_y = lx + get_group_id(0) * 1;

    const bool inside = gx < width && gy < height;

    if (inside) block[slot] = input[gx + gy * width];
    /* The barrier stays outside the range test so every work-item reaches it. */
    barrier(CLK_LOCAL_MEM_FENCE);
    if (inside) output[dst_x + dst_y * height] = block[slot];
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two two-component values.
 *
 * Reading .x as the real part and .y as the imaginary part, the result is
 * a multiplied by the complex conjugate of b.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;
    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/*
 * Build the pair (cos(alpha), sin(alpha)).
 *
 * sincos() returns the sine and stores the cosine through its second
 * argument, so one call yields both components.
 */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 16 stack variables, 0 constants, and 16 memory accesses
*/
/*
 * dft4 - in-place four-point butterfly on real2_t values (additions and
 * subtractions only).
 *
 * All four inputs are read before any output is written. Outputs 0 and 2
 * are the sum and difference of (u0 + u2) and (u1 + u3); outputs 1 and 3
 * combine (u0 - u2) with the component-swapped (u1 - u3).
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    const real2_t c = *u2;
    const real2_t d = *u3;

    /* Component-wise sums and differences of the (0,2) and (1,3) pairs. */
    const real2_t sum_ac = a + c;
    const real2_t dif_ac = a - c;
    const real2_t sum_bd = b + d;
    const real2_t dif_bd = b - d;

    real2_t out1;
    real2_t out3;
    out1.x = dif_ac.x - dif_bd.y;
    out1.y = dif_bd.x + dif_ac.y;
    out3.x = dif_ac.x + dif_bd.y;
    out3.y = dif_ac.y - dif_bd.x;

    *u0 = sum_ac + sum_bd;
    *u1 = out1;
    *u2 = sum_ac - sum_bd;
    *u3 = out3;
}
/*
 * One radix-4 pass.
 *
 * Each work-item with global id (0) below `threads` gathers four inputs
 * spaced `threads` elements apart, rotates inputs 1..3 by twiddle factors
 * unless p == 1, runs dft4 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 4 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 4;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];

    /* Angle literals are multiples of about -pi/2, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-1.570796326794897 * pos / p));
        v2 = mul(v2, twiddle((double)-3.141592653589793 * pos / p));
        v3 = mul(v3, twiddle((double)-4.71238898038469 * pos / p));
    }

    dft4(&v0, &v1, &v2, &v3);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 4;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two two-component values.
 *
 * Reading .x as the real part and .y as the imaginary part, the result is
 * a multiplied by the complex conjugate of b.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;
    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/*
 * Build the pair (cos(alpha), sin(alpha)).
 *
 * sincos() returns the sine and stores the cosine through its second
 * argument, so one call yields both components.
 */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 7 -name dft7 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 60 FP additions, 42 FP multiplications,
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
* 61 stack variables, 6 constants, and 28 memory accesses
*/
/*
 * dft7 - generated straight-line codelet working on seven real2_t values.
 *
 * Every argument is loaded once into a local copy (v0..v6); the locals are
 * recombined with the KP* constants through fma(), and the results are
 * stored back through the same pointers, i.e. the operation is in place.
 * According to the generator banner preceding this function it is the
 * size-7 transform generated with -sign 1; .x/.y presumably carry the
 * real/imaginary parts - confirm against the callers.
 *
 * NOTE(review): the statement order was produced by the generator's
 * scheduler; edit the generator input rather than this body.
 */
DEVICE void
dft7 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6)
{
/* Load all inputs into locals before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
{
/* Generator-emitted constants; each name is KP plus the leading digits. */
const real_t KP974927912 =
+0.974927912181823607018131682993931217232785801;
const real_t KP900968867 =
+0.900968867902419126236102319507445051165919162;
const real_t KP692021471 =
+0.692021471630095869627814897002069140197260599;
const real_t KP801937735 =
+0.801937735804838252472204639014890102331838324;
const real_t KP356895867 =
+0.356895867892209443894399510021300583399127187;
const real_t KP554958132 =
+0.554958132087371191422194871006410481067288862;
{
real_t T1, Tx, TP, Tv, TC, TK, TN, TE, TA, TF;
{
real_t Tb, Ti, Tj, Tf, Tq, T4, To, Ta, Tg, TB, TL, Tp, T7, Tw,
Tr;
real_t TD, TQ, TH, Te;
Tb = v0.y;
T1 = v0.x;
{
real_t Tc, Td, T5, T6;
{
real_t T2, T3, T8, T9;
/* Inputs are consumed in pairs (1,6), (3,4), (2,5). */
Tc = v1.y;
T2 = v1.x;
T3 = v6.x;
Td = v6.y;
Ti = v3.y;
T8 = v3.x;
T9 = v4.x;
Tj = v4.y;
Tf = v2.y;
T5 = v2.x;
Tq = T2 - T3;
T4 = T2 + T3;
To = T8 - T9;
Ta = T8 + T9;
T6 = v5.x;
Tg = v5.y;
}
TB = fma (KP554958132, To, Tq);
TL = fma (-KP356895867, T4, Ta);
Tp = T5 - T6;
T7 = T5 + T6;
Tw = fma (KP554958132, Tp, To);
Tr = fma (-KP554958132, Tq, Tp);
TD = fma (-KP356895867, T7, T4);
TQ = fma (-KP356895867, Ta, T7);
TH = Td - Tc;
Te = Tc + Td;
}
{
real_t Tk, TG, Th, TJ, Tt, TO, Ty, Ts, TU, Tm, TR, TI, TT,
Tl;
TI = Tj - Ti;
Tk = Ti + Tj;
TG = Tg - Tf;
Th = Tf + Tg;
TJ = fma (KP554958132, TI, TH);
Tt = fma (-KP356895867, Te, Tk);
TO = fma (KP554958132, TG, TI);
TT = fma (-KP554958132, TH, TG);
Ty = fma (-KP356895867, Th, Te);
Tl = fma (-KP356895867, Tk, Th);
/* Output 0 is the plain sum of all seven inputs, per component. */
v0.y = Tb + Te + Th + Tk;
v0.x = T1 + T4 + T7 + Ta;
Ts = fma (-KP801937735, Tr, To);
TU = fma (-KP801937735, TT, TI);
Tm = fma (-KP692021471, Tl, Te);
TR = fma (-KP692021471, TQ, T4);
{
real_t TM, Tu, Tn, TS, Tz;
Tx = fma (-KP801937735, Tw, Tq);
Tu = fma (-KP692021471, Tt, Th);
Tn = fma (-KP900968867, Tm, Tb);
TS = fma (-KP900968867, TR, T1);
TP = fma (-KP801937735, TO, TH);
Tv = fma (-KP900968867, Tu, Tb);
v3.y = fma (KP974927912, Ts, Tn);
v3.x = fma (KP974927912, TU, TS);
v4.x = fma (-KP974927912, TU, TS);
v4.y = fma (-KP974927912, Ts, Tn);
TM = fma (-KP692021471, TL, T7);
TC = fma (KP801937735, TB, Tp);
Tz = fma (-KP692021471, Ty, Tk);
TK = fma (KP801937735, TJ, TG);
TN = fma (-KP900968867, TM, T1);
TE = fma (-KP692021471, TD, Ta);
TA = fma (-KP900968867, Tz, Tb);
}
}
}
v2.x = fma (KP974927912, TP, TN);
v2.y = fma (KP974927912, Tx, Tv);
v5.y = fma (-KP974927912, Tx, Tv);
v5.x = fma (-KP974927912, TP, TN);
TF = fma (-KP900968867, TE, T1);
v1.y = fma (KP974927912, TC, TA);
v1.x = fma (KP974927912, TK, TF);
v6.x = fma (-KP974927912, TK, TF);
v6.y = fma (-KP974927912, TC, TA);
}
}
/* Store the results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
}
/*
 * One radix-7 pass.
 *
 * Each work-item with global id (0) below `threads` gathers seven inputs
 * spaced `threads` elements apart, rotates inputs 1..6 by twiddle factors
 * unless p == 1, runs dft7 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 7 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 7;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];

    /* Angle literals are multiples of about -2*pi/7, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.8975979010256552 * pos / p));
        v2 = mul(v2, twiddle((double)-1.79519580205131 * pos / p));
        v3 = mul(v3, twiddle((double)-2.692793703076966 * pos / p));
        v4 = mul(v4, twiddle((double)-3.590391604102621 * pos / p));
        v5 = mul(v5, twiddle((double)-4.487989505128276 * pos / p));
        v6 = mul(v6, twiddle((double)-5.385587406153931 * pos / p));
    }

    dft7(&v0, &v1, &v2, &v3, &v4, &v5, &v6);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 7;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Copy input[x + y * width] to output[y' + x' * height] through a local
 * staging tile, for work-items whose global ids satisfy x < width and
 * y < height.
 *
 * The tile edge is 1 in this generated instance (see the `* 1` factors and
 * the one-element tile), so each work-group stages a single element.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 block[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Slot inside the tile, and destination coordinates with x/y swapped. */
    const size_t slot = lx + ly * 1;
    const size_t dst_x = ly + get_group_id(1) * 1;
    const size_t dst_y = lx + get_group_id(0) * 1;

    const bool inside = gx < width && gy < height;

    if (inside) block[slot] = input[gx + gy * width];
    /* The barrier stays outside the range test so every work-item reaches it. */
    barrier(CLK_LOCAL_MEM_FENCE);
    if (inside) output[dst_x + dst_y * height] = block[slot];
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Copy input[x + y * width] to output[y' + x' * height] through a local
 * staging tile, for work-items whose global ids satisfy x < width and
 * y < height.
 *
 * The tile edge is 1 in this generated instance (see the `* 1` factors and
 * the one-element tile), so each work-group stages a single element.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    local double2 block[1];

    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Slot inside the tile, and destination coordinates with x/y swapped. */
    const size_t slot = lx + ly * 1;
    const size_t dst_x = ly + get_group_id(1) * 1;
    const size_t dst_y = lx + get_group_id(0) * 1;

    const bool inside = gx < width && gy < height;

    if (inside) block[slot] = input[gx + gy * width];
    /* The barrier stays outside the range test so every work-item reaches it. */
    barrier(CLK_LOCAL_MEM_FENCE);
    if (inside) output[dst_x + dst_y * height] = block[slot];
}
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){0.34277631970801009} exceeds 1e-08
FFT(C2C) size=1690 batch=1
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two two-component values.
 *
 * Reading .x as the real part and .y as the imaginary part, the result is
 * the ordinary complex product a * b.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Build the pair (cos(alpha), sin(alpha)).
 *
 * sincos() returns the sine and stores the cosine through its second
 * argument, so one call yields both components.
 */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/*
 * dft2 - in-place two-point butterfly on real2_t values.
 *
 * Both inputs are read first; *u0 then receives their component-wise sum
 * and *u1 their component-wise difference (first minus second).
 */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t first = *u0;
    const real2_t second = *u1;

    *u0 = first + second;
    *u1 = first - second;
}
/*
 * One radix-2 pass.
 *
 * Each work-item with global id (0) below `threads` gathers two inputs
 * spaced `threads` elements apart, rotates input 1 by a twiddle factor
 * unless p == 1, runs dft2 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 2 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 2;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];

    /* The angle literal is about -pi, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-3.141592653589793 * pos / p));
    }

    dft2(&v0, &v1);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 2;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two two-component values.
 *
 * Reading .x as the real part and .y as the imaginary part, the result is
 * the ordinary complex product a * b.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Build the pair (cos(alpha), sin(alpha)).
 *
 * sincos() returns the sine and stores the cosine through its second
 * argument, so one call yields both components.
 */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 41 stack variables, 4 constants, and 20 memory accesses
*/
/*
 * dft5 - generated straight-line codelet working on five real2_t values.
 *
 * Every argument is loaded once into a local copy (v0..v4); the locals are
 * recombined with the KP* constants through fma(), and the results are
 * stored back through the same pointers, i.e. the operation is in place.
 * According to the generator banner preceding this function it is the
 * size-5 transform generated with -sign -1; .x/.y presumably carry the
 * real/imaginary parts - confirm against the callers.
 *
 * NOTE(review): the statement order was produced by the generator's
 * scheduler; edit the generator input rather than this body.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
/* Load all inputs into locals before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
{
/* Generator-emitted constants; each name is KP plus the leading digits. */
const real_t KP951056516 =
+0.951056516295153572116439333379382143405698634;
const real_t KP559016994 =
+0.559016994374947424102293417182819058860154590;
const real_t KP250000000 =
+0.250000000000000000000000000000000000000000000;
const real_t KP618033988 =
+0.618033988749894848204586834365638117720309180;
{
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
{
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
Tl = v0.y;
T1 = v0.x;
{
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
/* Inputs are consumed in pairs (1,4) and (2,3). */
Tc = v1.y;
T2 = v1.x;
T3 = v4.x;
Td = v4.y;
Tf = v2.y;
T5 = v2.x;
T6 = v3.x;
Tg = v3.y;
Ts = T2 - T3;
T4 = T2 + T3;
Tt = T5 - T6;
T7 = T5 + T6;
T8 = T4 + T7;
Ta = T4 - T7;
Te = Tc - Td;
Tm = Tc + Td;
Tn = Tf + Tg;
Th = Tf - Tg;
}
To = Tm + Tn;
Tq = Tm - Tn;
Ti = fma (KP618033988, Th, Te);
Tk = fma (-KP618033988, Te, Th);
/* Output 0 is the plain sum of all five inputs, per component. */
v0.y = Tl + To;
v0.x = T1 + T8;
T9 = fma (-KP250000000, T8, T1);
Tu = fma (KP618033988, Tt, Ts);
Tw = fma (-KP618033988, Ts, Tt);
Tp = fma (-KP250000000, To, Tl);
Tb = fma (KP559016994, Ta, T9);
Tj = fma (-KP559016994, Ta, T9);
}
Tr = fma (KP559016994, Tq, Tp);
Tv = fma (-KP559016994, Tq, Tp);
v2.x = fma (-KP951056516, Tk, Tj);
v2.y = fma (KP951056516, Tw, Tv);
v3.y = fma (-KP951056516, Tw, Tv);
v3.x = fma (KP951056516, Tk, Tj);
v4.x = fma (-KP951056516, Ti, Tb);
v4.y = fma (KP951056516, Tu, Tr);
v1.y = fma (-KP951056516, Tu, Tr);
v1.x = fma (KP951056516, Ti, Tb);
}
}
/* Store the results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
}
/*
 * One radix-5 pass.
 *
 * Each work-item with global id (0) below `threads` gathers five inputs
 * spaced `threads` elements apart, rotates inputs 1..4 by twiddle factors
 * unless p == 1, runs dft5 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 5 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 5;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];

    /* Angle literals are multiples of about -2*pi/5, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-1.256637061435917 * pos / p));
        v2 = mul(v2, twiddle((double)-2.513274122871834 * pos / p));
        v3 = mul(v3, twiddle((double)-3.769911184307752 * pos / p));
        v4 = mul(v4, twiddle((double)-5.026548245743669 * pos / p));
    }

    dft5(&v0, &v1, &v2, &v3, &v4);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 5;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
}
/* Qualifier macro that expands to nothing. */
#define DEVICE
/* Enable double support through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component aliases; this program is built on double. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two two-component values.
 *
 * Reading .x as the real part and .y as the imaginary part, the result is
 * the ordinary complex product a * b.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/*
 * Build the pair (cos(alpha), sin(alpha)).
 *
 * sincos() returns the sine and stores the cosine through its second
 * argument, so one call yields both components.
 */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 114 stack variables, 25 constants, and 52 memory accesses
*/
/*
 * dft13 - generated straight-line codelet working on thirteen real2_t values.
 *
 * Every argument is loaded once into a local copy (v0..v12); the locals are
 * recombined with the KP* constants through fma(), and the results are
 * stored back through the same pointers, i.e. the operation is in place.
 * According to the generator banner preceding this function it is the
 * size-13 transform generated with -sign -1; .x/.y presumably carry the
 * real/imaginary parts - confirm against the callers.
 *
 * NOTE(review): the statement order was produced by the generator's
 * scheduler; edit the generator input rather than this body.
 */
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12)
{
/* Load all inputs into locals before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
{
/* Generator-emitted constants; each name is KP plus the leading digits. */
const real_t KP600477271 =
+0.600477271932665282925769253334763009352012849;
const real_t KP875502302 =
+0.875502302409147941146295545768755143177842006;
const real_t KP520028571 =
+0.520028571888864619117130500499232802493238139;
const real_t KP575140729 =
+0.575140729474003121368385547455453388461001608;
const real_t KP300462606 =
+0.300462606288665774426601772289207995520941381;
const real_t KP516520780 =
+0.516520780623489722840901288569017135705033622;
const real_t KP968287244 =
+0.968287244361984016049539446938120421179794516;
const real_t KP503537032 =
+0.503537032863766627246873853868466977093348562;
const real_t KP251768516 =
+0.251768516431883313623436926934233488546674281;
const real_t KP581704778 =
+0.581704778510515730456870384989698884939833902;
const real_t KP859542535 =
+0.859542535098774820163672132761689612766401925;
const real_t KP083333333 =
+0.083333333333333333333333333333333333333333333;
const real_t KP957805992 =
+0.957805992594665126462521754605754580515587217;
const real_t KP522026385 =
+0.522026385161275033714027226654165028300441940;
const real_t KP853480001 =
+0.853480001859823990758994934970528322872359049;
const real_t KP769338817 =
+0.769338817572980603471413688209101117038278899;
const real_t KP612264650 =
+0.612264650376756543746494474777125408779395514;
const real_t KP038632954 =
+0.038632954644348171955506895830342264440241080;
const real_t KP302775637 =
+0.302775637731994646559610633735247973125648287;
const real_t KP514918778 =
+0.514918778086315755491789696138117261566051239;
const real_t KP686558370 =
+0.686558370781754340655719594850823015421401653;
const real_t KP226109445 =
+0.226109445035782405468510155372505010481906348;
const real_t KP301479260 =
+0.301479260047709873958013540496673347309208464;
const real_t KP866025403 =
+0.866025403784438646763723170752936183471402627;
const real_t KP500000000 =
+0.500000000000000000000000000000000000000000000;
{
real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
{
real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw,
T2j, T2c, T1m;
real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
T1P = v0.y;
T1 = v0.x;
{
real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR,
T2b, Tv, Ts;
real_t T2a;
{
real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu,
Tl;
{
real_t T7, T8, T9, Td, Te;
TK = v8.y;
Td = v8.x;
Te = v5.x;
TL = v5.y;
T16 = v12.y;
T7 = v12.x;
T8 = v10.x;
TY = v10.y;
TZ = v4.y;
T9 = v4.x;
T2d = Td - Te;
Tf = Td + Te;
{
real_t T2, Ta, T3, T4;
T2 = v1.x;
T13 = v1.y;
Ta = T8 + T9;
Tq = T8 - T9;
TW = v3.y;
T3 = v3.x;
T4 = v9.x;
TV = v9.y;
{
real_t Tg, T5, Th, Tj, Tk;
TN = v11.y;
Tg = v11.x;
Ty = fma (KP500000000, Ta, -(T7));
Tb = T7 + Ta;
Tr = T4 - T3;
T5 = T3 + T4;
Th = v6.x;
TO = v6.y;
TQ = v7.y;
Tj = v7.x;
Tk = v2.x;
TR = v2.y;
T6 = T2 + T5;
Tx = fma (-KP500000000, T5, T2);
Ti = Tg + Th;
Tt = Tg - Th;
Tu = Tj - Tk;
Tl = Tj + Tk;
}
}
}
{
real_t Tc, Tm, T2e, T2g;
Tc = T6 + Tb;
T2n = T6 - Tb;
T2b = Ti - Tl;
Tm = Ti + Tl;
T2e = Tt + Tu;
Tv = Tt - Tu;
Ts = Tq - Tr;
T2g = Tr + Tq;
{
real_t Tz, TA, Tn, T2f;
Tz = Tx - Ty;
T2a = Tx + Ty;
TA = fma (-KP500000000, Tm, Tf);
Tn = Tf + Tm;
T2f = fma (-KP500000000, T2e, T2d);
T2o = T2d + T2e;
To = Tc + Tn;
TH = Tc - Tn;
T2h = fma (KP866025403, T2g, T2f);
T2k = fma (-KP866025403, T2g, T2f);
TE = Tz - TA;
TB = Tz + TA;
}
}
}
{
real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a,
T1b, TS, T17, T14;
TF = Ts - Tv;
Tw = Ts + Tv;
T2j = fma (-KP866025403, T2b, T2a);
T2c = fma (KP866025403, T2b, T2a);
T1R = TK + TL;
TM = TK - TL;
T17 = TY + TZ;
T10 = TY - TZ;
T18 = fma (KP500000000, T17, -(T16));
T1l = T16 + T17;
TX = TV - TW;
T14 = TW + TV;
T1k = T13 + T14;
T15 = fma (-KP500000000, T14, T13);
TP = TN - TO;
T1a = TN + TO;
T1b = TQ + TR;
TS = TQ - TR;
{
real_t T1Q, T11, TT, T1S;
T1Q = T1k + T1l;
T1m = T1k - T1l;
T11 = TX + T10;
T1W = T10 - TX;
T1X = TP - TS;
TT = TP + TS;
T1S = T1a + T1b;
T1c = T1a - T1b;
{
real_t T1Z, TU, T1T, T20;
T19 = T15 + T18;
T1Z = T15 - T18;
T1j = TM + TT;
TU = fma (-KP500000000, TT, TM);
T1T = T1R + T1S;
T20 = fma (-KP500000000, T1S, T1R);
T12 = fma (KP866025403, T11, TU);
T1f = fma (-KP866025403, T11, TU);
T21 = T1Z + T20;
T24 = T1Z - T20;
T27 = T1Q - T1T;
T1U = T1Q + T1T;
}
}
}
}
{
real_t T1g, T1d, T25, T1Y;
T1g = fma (-KP866025403, T1c, T19);
T1d = fma (KP866025403, T1c, T19);
T25 = T1W - T1X;
T1Y = T1W + T1X;
/* Output 0 is input 0 plus an accumulated sum term, per component. */
v0.y = T1P + T1U;
v0.x = T1 + To;
{
real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
{
real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI,
T1s;
{
real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x,
T1y, T1H, T1i;
TC = fma (KP301479260, TB, Tw);
T1x = fma (-KP226109445, Tw, TB);
T1y = fma (KP686558370, TE, TF);
TG = fma (-KP514918778, TF, TE);
T1n = fma (-KP302775637, T1m, T1j);
T1G = fma (KP302775637, T1j, T1m);
T1u = fma (-KP038632954, T12, T1d);
T1e = fma (KP038632954, T1d, T12);
T1h = fma (KP612264650, T1g, T1f);
T1v = fma (-KP612264650, T1f, T1g);
T1J = fma (KP769338817, T1y, T1x);
T1z = fma (-KP769338817, T1y, T1x);
T1H = fma (-KP853480001, T1v, T1u);
T1w = fma (KP853480001, T1v, T1u);
T1I = fma (-KP522026385, T1H, T1G);
T1O = fma (KP957805992, T1G, T1H);
Tp = fma (-KP083333333, To, T1);
T1E = fma (KP853480001, T1h, T1e);
T1i = fma (-KP853480001, T1h, T1e);
T1q = fma (-KP859542535, TG, TH);
TI = fma (KP581704778, TH, TG);
T1o = fma (KP957805992, T1n, T1i);
T1s = fma (-KP522026385, T1i, T1n);
}
{
real_t T1D, T1p, TD, T1r;
T1p = fma (-KP251768516, TC, Tp);
TD = fma (KP503537032, TC, Tp);
T1C = fma (-KP968287244, T1z, T1w);
T1A = fma (KP968287244, T1z, T1w);
TJ = fma (KP516520780, TI, TD);
T1N = fma (-KP516520780, TI, TD);
T1D = fma (-KP300462606, T1q, T1p);
T1r = fma (KP300462606, T1q, T1p);
T1t = fma (-KP575140729, T1s, T1r);
T1B = fma (KP575140729, T1s, T1r);
T1L = fma (-KP520028571, T1E, T1D);
T1F = fma (KP520028571, T1E, T1D);
T1K = fma (KP875502302, T1J, T1I);
T1M = fma (-KP875502302, T1J, T1I);
}
}
{
real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C,
T28, T2y, T2M, T2q;
{
real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D,
T2E, T2u, T2m;
T2D = fma (-KP226109445, T1Y, T21);
T22 = fma (KP301479260, T21, T1Y);
T26 = fma (-KP514918778, T25, T24);
T2E = fma (KP686558370, T24, T25);
T2v = fma (-KP302775637, T2n, T2o);
T2p = fma (KP302775637, T2o, T2n);
T2i = fma (-KP038632954, T2h, T2c);
T2s = fma (KP038632954, T2c, T2h);
T2t = fma (KP612264650, T2j, T2k);
T2l = fma (-KP612264650, T2k, T2j);
T2F = fma (-KP769338817, T2E, T2D);
T2N = fma (KP769338817, T2E, T2D);
T2K = fma (KP853480001, T2t, T2s);
T2u = fma (-KP853480001, T2t, T2s);
T2w = fma (KP957805992, T2v, T2u);
T2A = fma (-KP522026385, T2u, T2v);
T1V = fma (-KP083333333, T1U, T1P);
T2m = fma (-KP853480001, T2l, T2i);
T2C = fma (KP853480001, T2l, T2i);
T28 = fma (KP581704778, T27, T26);
T2y = fma (-KP859542535, T26, T27);
T2M = fma (-KP522026385, T2m, T2p);
T2q = fma (KP957805992, T2p, T2m);
}
{
real_t T2O, T2Q, T2z, T2P, T2L;
{
real_t T23, T2x, T2r, T29, T2J;
T23 = fma (KP503537032, T22, T1V);
T2x = fma (-KP251768516, T22, T1V);
T2O = fma (-KP875502302, T2N, T2M);
T2Q = fma (KP875502302, T2N, T2M);
T2r = fma (KP516520780, T28, T23);
T29 = fma (-KP516520780, T28, T23);
T2z = fma (KP300462606, T2y, T2x);
T2J = fma (-KP300462606, T2y, T2x);
v12.x = fma (KP600477271, T1o, TJ);
v12.y = fma (-KP600477271, T2w, T2r);
v1.y = fma (KP600477271, T2w, T2r);
v1.x = fma (-KP600477271, T1o, TJ);
v8.x = fma (-KP600477271, T1O, T1N);
v8.y = fma (KP600477271, T2q, T29);
v5.y = fma (-KP600477271, T2q, T29);
v5.x = fma (KP600477271, T1O, T1N);
T2P = fma (KP520028571, T2K, T2J);
T2L = fma (-KP520028571, T2K, T2J);
}
T2B = fma (KP575140729, T2A, T2z);
T2H = fma (-KP575140729, T2A, T2z);
v11.x = fma (-KP575140729, T1M, T1L);
v11.y = fma (KP575140729, T2Q, T2P);
v6.y = fma (-KP575140729, T2Q, T2P);
v6.x = fma (KP575140729, T1M, T1L);
v7.x = fma (-KP575140729, T1K, T1F);
v7.y = fma (KP575140729, T2O, T2L);
v2.y = fma (-KP575140729, T2O, T2L);
v2.x = fma (KP575140729, T1K, T1F);
T2I = fma (KP968287244, T2F, T2C);
T2G = fma (-KP968287244, T2F, T2C);
}
}
}
}
}
v10.x = fma (-KP520028571, T1C, T1B);
v10.y = fma (KP520028571, T2I, T2H);
v4.y = fma (-KP520028571, T2I, T2H);
v4.x = fma (KP520028571, T1C, T1B);
v9.x = fma (-KP520028571, T1A, T1t);
v9.y = fma (KP520028571, T2G, T2B);
v3.y = fma (-KP520028571, T2G, T2B);
v3.x = fma (KP520028571, T1A, T1t);
}
}
/* Store the results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
}
/*
 * One radix-13 pass.
 *
 * Each work-item with global id (0) below `threads` gathers thirteen inputs
 * spaced `threads` elements apart, rotates inputs 1..12 by twiddle factors
 * unless p == 1, runs dft13 on them and scatters the results `p` elements
 * apart. Global id (1) selects a batch of threads * 13 elements.
 *
 * x       - input elements (read only)
 * y       - output elements
 * p       - output stride; also the modulus used to derive the twiddle index
 * threads - number of active work-items along dimension 0
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads) return;

    const size_t pos = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 13;

    /* Gather: element m of this butterfly lives m * threads further on. */
    global const double2 * src = x + (tid + batch_offset);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];
    double2 v4 = src[4 * threads];
    double2 v5 = src[5 * threads];
    double2 v6 = src[6 * threads];
    double2 v7 = src[7 * threads];
    double2 v8 = src[8 * threads];
    double2 v9 = src[9 * threads];
    double2 v10 = src[10 * threads];
    double2 v11 = src[11 * threads];
    double2 v12 = src[12 * threads];

    /* Angle literals are multiples of about -2*pi/13, scaled by pos / p. */
    if (p != 1)
    {
        v1 = mul(v1, twiddle((double)-0.483321946706122 * pos / p));
        v2 = mul(v2, twiddle((double)-0.966643893412244 * pos / p));
        v3 = mul(v3, twiddle((double)-1.449965840118366 * pos / p));
        v4 = mul(v4, twiddle((double)-1.933287786824488 * pos / p));
        v5 = mul(v5, twiddle((double)-2.41660973353061 * pos / p));
        v6 = mul(v6, twiddle((double)-2.899931680236732 * pos / p));
        v7 = mul(v7, twiddle((double)-3.383253626942854 * pos / p));
        v8 = mul(v8, twiddle((double)-3.866575573648976 * pos / p));
        v9 = mul(v9, twiddle((double)-4.349897520355098 * pos / p));
        v10 = mul(v10, twiddle((double)-4.833219467061221 * pos / p));
        v11 = mul(v11, twiddle((double)-5.316541413767341 * pos / p));
        v12 = mul(v12, twiddle((double)-5.799863360473465 * pos / p));
    }

    dft13(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12);

    /* Scatter: results are written p elements apart. */
    const size_t first_out = pos + (tid - pos) * 13;
    global double2 * dst = y + (first_out + batch_offset);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
    dst[4 * p] = v4;
    dst[5 * p] = v5;
    dst[6 * p] = v6;
    dst[7 * p] = v7;
    dst[8 * p] = v8;
    dst[9 * p] = v9;
    dst[10 * p] = v10;
    dst[11 * p] = v11;
    dst[12 * p] = v12;
}
/* DEVICE expands to nothing; it only fills the qualifier slot in front of the
 * generated DFT routine further down. */
#define DEVICE
/* Enable double precision through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component element types used by the generated DFT code. */
typedef double real_t;
typedef double2 real2_t;
/*
 * mul: complex product a * b, with .x as the real part and .y as the
 * imaginary part of each operand.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 product;
    product.x = a.x * b.x - a.y * b.y;
    product.y = a.y * b.x + a.x * b.y;
    return product;
}
/*
 * twiddle: returns (cos(alpha), sin(alpha)) packed as (.x, .y).
 * sincos() hands back the sine and stores the cosine through its pointer.
 */
double2 twiddle
(
    double alpha
)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);

    double2 w;
    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 114 stack variables, 25 constants, and 52 memory accesses
*/
/*
 * dft13: in-place 13-point DFT on thirteen two-component values
 * (generator options: -n 13 -sign -1).
 *
 * u0..u12 each point at one value; .x and .y are processed as separate
 * real_t lanes. All thirteen inputs are copied into locals first, the
 * transform is evaluated as a straight-line sequence of additions and fma()
 * calls, and the thirteen results are stored back through the same pointers
 * at the end. Returns nothing.
 *
 * The body is machine-generated: the T* temporaries follow the generator's
 * naming and the nested scopes only limit their lifetimes.
 */
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12)
{
/* Snapshot every input before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
{
/* Numeric constants emitted by the generator; the name KPddddddddd carries
 * the first nine fractional digits of the value. */
const real_t KP600477271 =
+0.600477271932665282925769253334763009352012849;
const real_t KP875502302 =
+0.875502302409147941146295545768755143177842006;
const real_t KP520028571 =
+0.520028571888864619117130500499232802493238139;
const real_t KP575140729 =
+0.575140729474003121368385547455453388461001608;
const real_t KP300462606 =
+0.300462606288665774426601772289207995520941381;
const real_t KP516520780 =
+0.516520780623489722840901288569017135705033622;
const real_t KP968287244 =
+0.968287244361984016049539446938120421179794516;
const real_t KP503537032 =
+0.503537032863766627246873853868466977093348562;
const real_t KP251768516 =
+0.251768516431883313623436926934233488546674281;
const real_t KP581704778 =
+0.581704778510515730456870384989698884939833902;
const real_t KP859542535 =
+0.859542535098774820163672132761689612766401925;
const real_t KP083333333 =
+0.083333333333333333333333333333333333333333333;
const real_t KP957805992 =
+0.957805992594665126462521754605754580515587217;
const real_t KP522026385 =
+0.522026385161275033714027226654165028300441940;
const real_t KP853480001 =
+0.853480001859823990758994934970528322872359049;
const real_t KP769338817 =
+0.769338817572980603471413688209101117038278899;
const real_t KP612264650 =
+0.612264650376756543746494474777125408779395514;
const real_t KP038632954 =
+0.038632954644348171955506895830342264440241080;
const real_t KP302775637 =
+0.302775637731994646559610633735247973125648287;
const real_t KP514918778 =
+0.514918778086315755491789696138117261566051239;
const real_t KP686558370 =
+0.686558370781754340655719594850823015421401653;
const real_t KP226109445 =
+0.226109445035782405468510155372505010481906348;
const real_t KP301479260 =
+0.301479260047709873958013540496673347309208464;
const real_t KP866025403 =
+0.866025403784438646763723170752936183471402627;
const real_t KP500000000 =
+0.500000000000000000000000000000000000000000000;
{
/* These eight survive the inner scopes: they feed outputs 3, 4, 9 and 10,
 * which are written last. */
real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
{
real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw,
T2j, T2c, T1m;
real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
T1P = v0.y;
T1 = v0.x;
{
real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR,
T2b, Tv, Ts;
real_t T2a;
{
real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu,
Tl;
{
/* Read inputs 1..12 and form the first sums/differences of the .x lanes;
 * the .y lanes are only loaded here and combined in the next scope. */
real_t T7, T8, T9, Td, Te;
TK = v8.y;
Td = v8.x;
Te = v5.x;
TL = v5.y;
T16 = v12.y;
T7 = v12.x;
T8 = v10.x;
TY = v10.y;
TZ = v4.y;
T9 = v4.x;
T2d = Td - Te;
Tf = Td + Te;
{
real_t T2, Ta, T3, T4;
T2 = v1.x;
T13 = v1.y;
Ta = T8 + T9;
Tq = T8 - T9;
TW = v3.y;
T3 = v3.x;
T4 = v9.x;
TV = v9.y;
{
real_t Tg, T5, Th, Tj, Tk;
TN = v11.y;
Tg = v11.x;
Ty = fma (KP500000000, Ta, -(T7));
Tb = T7 + Ta;
Tr = T4 - T3;
T5 = T3 + T4;
Th = v6.x;
TO = v6.y;
TQ = v7.y;
Tj = v7.x;
Tk = v2.x;
TR = v2.y;
T6 = T2 + T5;
Tx = fma (-KP500000000, T5, T2);
Ti = Tg + Th;
Tt = Tg - Th;
Tu = Tj - Tk;
Tl = Tj + Tk;
}
}
}
{
real_t Tc, Tm, T2e, T2g;
Tc = T6 + Tb;
T2n = T6 - Tb;
T2b = Ti - Tl;
Tm = Ti + Tl;
T2e = Tt + Tu;
Tv = Tt - Tu;
Ts = Tq - Tr;
T2g = Tr + Tq;
{
real_t Tz, TA, Tn, T2f;
Tz = Tx - Ty;
T2a = Tx + Ty;
TA = fma (-KP500000000, Tm, Tf);
Tn = Tf + Tm;
T2f = fma (-KP500000000, T2e, T2d);
T2o = T2d + T2e;
/* To accumulates the .x lanes of inputs 1..12. */
To = Tc + Tn;
TH = Tc - Tn;
T2h = fma (KP866025403, T2g, T2f);
T2k = fma (-KP866025403, T2g, T2f);
TE = Tz - TA;
TB = Tz + TA;
}
}
}
{
/* Same combination pattern applied to the .y lanes loaded above. */
real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a,
T1b, TS, T17, T14;
TF = Ts - Tv;
Tw = Ts + Tv;
T2j = fma (-KP866025403, T2b, T2a);
T2c = fma (KP866025403, T2b, T2a);
T1R = TK + TL;
TM = TK - TL;
T17 = TY + TZ;
T10 = TY - TZ;
T18 = fma (KP500000000, T17, -(T16));
T1l = T16 + T17;
TX = TV - TW;
T14 = TW + TV;
T1k = T13 + T14;
T15 = fma (-KP500000000, T14, T13);
TP = TN - TO;
T1a = TN + TO;
T1b = TQ + TR;
TS = TQ - TR;
{
real_t T1Q, T11, TT, T1S;
T1Q = T1k + T1l;
T1m = T1k - T1l;
T11 = TX + T10;
T1W = T10 - TX;
T1X = TP - TS;
TT = TP + TS;
T1S = T1a + T1b;
T1c = T1a - T1b;
{
real_t T1Z, TU, T1T, T20;
T19 = T15 + T18;
T1Z = T15 - T18;
T1j = TM + TT;
TU = fma (-KP500000000, TT, TM);
T1T = T1R + T1S;
T20 = fma (-KP500000000, T1S, T1R);
T12 = fma (KP866025403, T11, TU);
T1f = fma (-KP866025403, T11, TU);
T21 = T1Z + T20;
T24 = T1Z - T20;
T27 = T1Q - T1T;
/* T1U accumulates the .y lanes of inputs 1..12. */
T1U = T1Q + T1T;
}
}
}
}
{
real_t T1g, T1d, T25, T1Y;
T1g = fma (-KP866025403, T1c, T19);
T1d = fma (KP866025403, T1c, T19);
T25 = T1W - T1X;
T1Y = T1W + T1X;
/* Output 0 is the plain sum of all thirteen inputs. */
v0.y = T1P + T1U;
v0.x = T1 + To;
{
real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
{
real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI,
T1s;
{
real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x,
T1y, T1H, T1i;
TC = fma (KP301479260, TB, Tw);
T1x = fma (-KP226109445, Tw, TB);
T1y = fma (KP686558370, TE, TF);
TG = fma (-KP514918778, TF, TE);
T1n = fma (-KP302775637, T1m, T1j);
T1G = fma (KP302775637, T1j, T1m);
T1u = fma (-KP038632954, T12, T1d);
T1e = fma (KP038632954, T1d, T12);
T1h = fma (KP612264650, T1g, T1f);
T1v = fma (-KP612264650, T1f, T1g);
T1J = fma (KP769338817, T1y, T1x);
T1z = fma (-KP769338817, T1y, T1x);
T1H = fma (-KP853480001, T1v, T1u);
T1w = fma (KP853480001, T1v, T1u);
T1I = fma (-KP522026385, T1H, T1G);
T1O = fma (KP957805992, T1G, T1H);
Tp = fma (-KP083333333, To, T1);
T1E = fma (KP853480001, T1h, T1e);
T1i = fma (-KP853480001, T1h, T1e);
T1q = fma (-KP859542535, TG, TH);
TI = fma (KP581704778, TH, TG);
T1o = fma (KP957805992, T1n, T1i);
T1s = fma (-KP522026385, T1i, T1n);
}
{
real_t T1D, T1p, TD, T1r;
T1p = fma (-KP251768516, TC, Tp);
TD = fma (KP503537032, TC, Tp);
T1C = fma (-KP968287244, T1z, T1w);
T1A = fma (KP968287244, T1z, T1w);
TJ = fma (KP516520780, TI, TD);
T1N = fma (-KP516520780, TI, TD);
T1D = fma (-KP300462606, T1q, T1p);
T1r = fma (KP300462606, T1q, T1p);
T1t = fma (-KP575140729, T1s, T1r);
T1B = fma (KP575140729, T1s, T1r);
T1L = fma (-KP520028571, T1E, T1D);
T1F = fma (KP520028571, T1E, T1D);
T1K = fma (KP875502302, T1J, T1I);
T1M = fma (-KP875502302, T1J, T1I);
}
}
{
real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C,
T28, T2y, T2M, T2q;
{
real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D,
T2E, T2u, T2m;
T2D = fma (-KP226109445, T1Y, T21);
T22 = fma (KP301479260, T21, T1Y);
T26 = fma (-KP514918778, T25, T24);
T2E = fma (KP686558370, T24, T25);
T2v = fma (-KP302775637, T2n, T2o);
T2p = fma (KP302775637, T2o, T2n);
T2i = fma (-KP038632954, T2h, T2c);
T2s = fma (KP038632954, T2c, T2h);
T2t = fma (KP612264650, T2j, T2k);
T2l = fma (-KP612264650, T2k, T2j);
T2F = fma (-KP769338817, T2E, T2D);
T2N = fma (KP769338817, T2E, T2D);
T2K = fma (KP853480001, T2t, T2s);
T2u = fma (-KP853480001, T2t, T2s);
T2w = fma (KP957805992, T2v, T2u);
T2A = fma (-KP522026385, T2u, T2v);
T1V = fma (-KP083333333, T1U, T1P);
T2m = fma (-KP853480001, T2l, T2i);
T2C = fma (KP853480001, T2l, T2i);
T28 = fma (KP581704778, T27, T26);
T2y = fma (-KP859542535, T26, T27);
T2M = fma (-KP522026385, T2m, T2p);
T2q = fma (KP957805992, T2p, T2m);
}
{
real_t T2O, T2Q, T2z, T2P, T2L;
{
real_t T23, T2x, T2r, T29, T2J;
T23 = fma (KP503537032, T22, T1V);
T2x = fma (-KP251768516, T22, T1V);
T2O = fma (-KP875502302, T2N, T2M);
T2Q = fma (KP875502302, T2N, T2M);
T2r = fma (KP516520780, T28, T23);
T29 = fma (-KP516520780, T28, T23);
T2z = fma (KP300462606, T2y, T2x);
T2J = fma (-KP300462606, T2y, T2x);
/* Outputs come in pairs (12,1), (8,5), (11,6), (7,2): each pair shares its
 * operands and differs only in the sign of the fma coefficient. */
v12.x = fma (KP600477271, T1o, TJ);
v12.y = fma (-KP600477271, T2w, T2r);
v1.y = fma (KP600477271, T2w, T2r);
v1.x = fma (-KP600477271, T1o, TJ);
v8.x = fma (-KP600477271, T1O, T1N);
v8.y = fma (KP600477271, T2q, T29);
v5.y = fma (-KP600477271, T2q, T29);
v5.x = fma (KP600477271, T1O, T1N);
T2P = fma (KP520028571, T2K, T2J);
T2L = fma (-KP520028571, T2K, T2J);
}
T2B = fma (KP575140729, T2A, T2z);
T2H = fma (-KP575140729, T2A, T2z);
v11.x = fma (-KP575140729, T1M, T1L);
v11.y = fma (KP575140729, T2Q, T2P);
v6.y = fma (-KP575140729, T2Q, T2P);
v6.x = fma (KP575140729, T1M, T1L);
v7.x = fma (-KP575140729, T1K, T1F);
v7.y = fma (KP575140729, T2O, T2L);
v2.y = fma (-KP575140729, T2O, T2L);
v2.x = fma (KP575140729, T1K, T1F);
T2I = fma (KP968287244, T2F, T2C);
T2G = fma (-KP968287244, T2F, T2C);
}
}
}
}
}
/* Remaining pairs (10,4) and (9,3), built from the temporaries kept alive in
 * the outer scope. */
v10.x = fma (-KP520028571, T1C, T1B);
v10.y = fma (KP520028571, T2I, T2H);
v4.y = fma (-KP520028571, T2I, T2H);
v4.x = fma (KP520028571, T1C, T1B);
v9.x = fma (-KP520028571, T1A, T1t);
v9.y = fma (KP520028571, T2G, T2B);
v3.y = fma (-KP520028571, T2G, T2B);
v3.x = fma (KP520028571, T1A, T1t);
}
}
/* Write all thirteen results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
}
/*
 * radix: one radix-13 pass over a batch of complex sequences.
 *
 * x        input buffer (read only)
 * y        output buffer
 * p        spacing between the 13 results of one butterfly in y; also the
 *          period of the twiddle index (k = i % p)
 * threads  number of butterflies per batch; work-items whose
 *          get_global_id(0) is >= threads return immediately
 *
 * Work-item (i, b) reads 13 inputs spaced `threads` apart starting at
 * i + b*threads*13, multiplies inputs 1..12 by twiddle factors when p != 1,
 * runs dft13 on them, and writes the 13 results spaced `p` apart starting at
 * k + (i - k)*13 + b*threads*13.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads) return;

    /* Twiddle angle for each leg before scaling by k / p; numerically these
     * literals are m * (-2*pi/13). Leg 0 is never rotated. */
    const double leg_angle[13] = {
        0.0,
        -0.483321946706122,
        -0.966643893412244,
        -1.449965840118366,
        -1.933287786824488,
        -2.41660973353061,
        -2.899931680236732,
        -3.383253626942854,
        -3.866575573648976,
        -4.349897520355098,
        -4.833219467061221,
        -5.316541413767341,
        -5.799863360473465
    };

    const size_t phase = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 13;

    /* Gather the 13 legs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 leg[13];
    for (uint m = 0; m < 13; ++m)
        leg[m] = src[m * threads];

    /* The first pass (p == 1) needs no twiddles. */
    if (p != 1)
    {
        for (uint m = 1; m < 13; ++m)
            leg[m] = mul(leg[m], twiddle(leg_angle[m] * phase / p));
    }

    dft13(&leg[0], &leg[1], &leg[2], &leg[3], &leg[4], &leg[5], &leg[6],
          &leg[7], &leg[8], &leg[9], &leg[10], &leg[11], &leg[12]);

    /* Scatter the results with stride p. */
    global double2 * dst = y + (phase + (gid - phase) * 13 + batch_base);
    for (uint m = 0; m < 13; ++m)
        dst[m * p] = leg[m];
}
/* DEVICE expands to nothing; it only fills the qualifier slot in front of the
 * generated DFT routine further down. */
#define DEVICE
/* Enable double precision through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component element types used by the generated DFT code. */
typedef double real_t;
typedef double2 real2_t;
/*
 * mul: product of a with the complex conjugate of b, i.e. a * conj(b),
 * with .x as the real part and .y as the imaginary part of each operand.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 product;
    product.x = a.x * b.x + a.y * b.y;
    product.y = a.y * b.x - a.x * b.y;
    return product;
}
/*
 * twiddle: returns (cos(alpha), sin(alpha)) packed as (.x, .y).
 * sincos() hands back the sine and stores the cosine through its pointer.
 */
double2 twiddle
(
    double alpha
)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);

    double2 w;
    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/*
 * dft2: in-place 2-point butterfly.
 *
 * On return *u0 holds the component-wise sum of the two inputs and *u1
 * holds the component-wise difference (first minus second). Both inputs
 * are read before either output is stored.
 */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t first = *u0;
    const real2_t second = *u1;

    real2_t sum;
    real2_t diff;

    sum.x = first.x + second.x;
    sum.y = first.y + second.y;
    diff.y = first.y - second.y;
    diff.x = first.x - second.x;

    *u0 = sum;
    *u1 = diff;
}
/*
 * radix: one radix-2 pass over a batch of complex sequences.
 *
 * x        input buffer (read only)
 * y        output buffer
 * p        spacing between the two results of one butterfly in y; also the
 *          period of the twiddle index (k = i % p)
 * threads  number of butterflies per batch; work-items whose
 *          get_global_id(0) is >= threads return immediately
 *
 * Work-item (i, b) reads two inputs `threads` apart starting at
 * i + b*threads*2, multiplies the second by a twiddle factor when p != 1,
 * runs dft2, and writes the two results `p` apart starting at
 * k + (i - k)*2 + b*threads*2.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads) return;

    const size_t phase = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 2;

    global const double2 * src = x + (gid + batch_base);
    double2 lower = src[0];
    double2 upper = src[threads];

    /* The first pass (p == 1) needs no twiddle; the literal is -pi. */
    if (p != 1)
    {
        upper = mul(upper, twiddle((double)-3.141592653589793 * phase / p));
    }

    dft2(&lower, &upper);

    global double2 * dst = y + (phase + (gid - phase) * 2 + batch_base);
    dst[0] = lower;
    dst[p] = upper;
}
/* DEVICE expands to nothing; it only fills the qualifier slot in front of the
 * generated DFT routine further down. */
#define DEVICE
/* Enable double precision through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component element types used by the generated DFT code. */
typedef double real_t;
typedef double2 real2_t;
/*
 * mul: product of a with the complex conjugate of b, i.e. a * conj(b),
 * with .x as the real part and .y as the imaginary part of each operand.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 product;
    product.x = a.x * b.x + a.y * b.y;
    product.y = a.y * b.x - a.x * b.y;
    return product;
}
/*
 * twiddle: returns (cos(alpha), sin(alpha)) packed as (.x, .y).
 * sincos() hands back the sine and stores the cosine through its pointer.
 */
double2 twiddle
(
    double alpha
)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);

    double2 w;
    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 41 stack variables, 4 constants, and 20 memory accesses
*/
/*
 * dft5: in-place 5-point DFT on five two-component values
 * (generator options: -n 5 -sign 1).
 *
 * u0..u4 each point at one value. All five inputs are read up front, the
 * results are computed with additions and fma() calls, and the five
 * outputs are stored back through the same pointers at the end.
 * Output 0 is the plain sum of the five inputs.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
    const real_t KP951056516 =
        +0.951056516295153572116439333379382143405698634;
    const real_t KP559016994 =
        +0.559016994374947424102293417182819058860154590;
    const real_t KP250000000 =
        +0.250000000000000000000000000000000000000000000;
    const real_t KP618033988 =
        +0.618033988749894848204586834365638117720309180;

    const real2_t in0 = *u0;
    const real2_t in1 = *u1;
    const real2_t in2 = *u2;
    const real2_t in3 = *u3;
    const real2_t in4 = *u4;

    /* .x lane: sums and differences of the pairs (1,4) and (2,3). */
    const real_t re_dif14 = in1.x - in4.x;
    const real_t re_sum14 = in1.x + in4.x;
    const real_t re_dif23 = in2.x - in3.x;
    const real_t re_sum23 = in2.x + in3.x;
    const real_t re_total = re_sum14 + re_sum23;
    const real_t re_spread = re_sum14 - re_sum23;

    /* .y lane: same pairings. */
    const real_t im_dif14 = in1.y - in4.y;
    const real_t im_sum14 = in1.y + in4.y;
    const real_t im_sum23 = in2.y + in3.y;
    const real_t im_dif23 = in2.y - in3.y;
    const real_t im_total = im_sum14 + im_sum23;
    const real_t im_spread = im_sum14 - im_sum23;

    /* Cross terms built from the differences. */
    const real_t im_mix14 = fma (KP618033988, im_dif23, im_dif14);
    const real_t im_mix23 = fma (-KP618033988, im_dif14, im_dif23);
    const real_t re_mix14 = fma (KP618033988, re_dif23, re_dif14);
    const real_t re_mix23 = fma (-KP618033988, re_dif14, re_dif23);

    /* Terms built from the sums. */
    const real_t re_base = fma (-KP250000000, re_total, in0.x);
    const real_t im_base = fma (-KP250000000, im_total, in0.y);
    const real_t re_mid14 = fma (KP559016994, re_spread, re_base);
    const real_t re_mid23 = fma (-KP559016994, re_spread, re_base);
    const real_t im_mid14 = fma (KP559016994, im_spread, im_base);
    const real_t im_mid23 = fma (-KP559016994, im_spread, im_base);

    real2_t out0;
    real2_t out1;
    real2_t out2;
    real2_t out3;
    real2_t out4;

    out0.x = in0.x + re_total;
    out0.y = in0.y + im_total;

    /* Outputs 1/4 and 2/3 share operands and differ only in coefficient sign. */
    out1.x = fma (-KP951056516, im_mix14, re_mid14);
    out1.y = fma (KP951056516, re_mix14, im_mid14);
    out4.x = fma (KP951056516, im_mix14, re_mid14);
    out4.y = fma (-KP951056516, re_mix14, im_mid14);

    out2.x = fma (KP951056516, im_mix23, re_mid23);
    out2.y = fma (-KP951056516, re_mix23, im_mid23);
    out3.x = fma (-KP951056516, im_mix23, re_mid23);
    out3.y = fma (KP951056516, re_mix23, im_mid23);

    *u0 = out0;
    *u1 = out1;
    *u2 = out2;
    *u3 = out3;
    *u4 = out4;
}
/*
 * radix: one radix-5 pass over a batch of complex sequences.
 *
 * x        input buffer (read only)
 * y        output buffer
 * p        spacing between the 5 results of one butterfly in y; also the
 *          period of the twiddle index (k = i % p)
 * threads  number of butterflies per batch; work-items whose
 *          get_global_id(0) is >= threads return immediately
 *
 * Work-item (i, b) reads 5 inputs spaced `threads` apart starting at
 * i + b*threads*5, multiplies inputs 1..4 by twiddle factors when p != 1,
 * runs dft5 on them, and writes the 5 results spaced `p` apart starting at
 * k + (i - k)*5 + b*threads*5.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads) return;

    /* Twiddle angle for each leg before scaling by k / p; numerically these
     * literals are m * (-2*pi/5). Leg 0 is never rotated. */
    const double leg_angle[5] = {
        0.0,
        -1.256637061435917,
        -2.513274122871834,
        -3.769911184307752,
        -5.026548245743669
    };

    const size_t phase = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 5;

    /* Gather the 5 legs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 leg[5];
    for (uint m = 0; m < 5; ++m)
        leg[m] = src[m * threads];

    /* The first pass (p == 1) needs no twiddles. */
    if (p != 1)
    {
        for (uint m = 1; m < 5; ++m)
            leg[m] = mul(leg[m], twiddle(leg_angle[m] * phase / p));
    }

    dft5(&leg[0], &leg[1], &leg[2], &leg[3], &leg[4]);

    /* Scatter the results with stride p. */
    global double2 * dst = y + (phase + (gid - phase) * 5 + batch_base);
    for (uint m = 0; m < 5; ++m)
        dst[m * p] = leg[m];
}
/* DEVICE expands to nothing; it only fills the qualifier slot in front of the
 * generated DFT routine further down. */
#define DEVICE
/* Enable double precision through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component element types used by the generated DFT code. */
typedef double real_t;
typedef double2 real2_t;
/*
 * mul: product of a with the complex conjugate of b, i.e. a * conj(b),
 * with .x as the real part and .y as the imaginary part of each operand.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 product;
    product.x = a.x * b.x + a.y * b.y;
    product.y = a.y * b.x - a.x * b.y;
    return product;
}
/*
 * twiddle: returns (cos(alpha), sin(alpha)) packed as (.x, .y).
 * sincos() hands back the sine and stores the cosine through its pointer.
 */
double2 twiddle
(
    double alpha
)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);

    double2 w;
    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 114 stack variables, 25 constants, and 52 memory accesses
*/
/*
 * dft13: in-place 13-point DFT on thirteen two-component values
 * (generator options: -n 13 -sign 1).
 *
 * u0..u12 each point at one value; .x and .y are processed as separate
 * real_t lanes. All thirteen inputs are copied into locals first, the
 * transform is evaluated as a straight-line sequence of additions and fma()
 * calls, and the thirteen results are stored back through the same pointers
 * at the end. Returns nothing.
 *
 * The body is machine-generated: the T* temporaries follow the generator's
 * naming and the nested scopes only limit their lifetimes.
 */
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12)
{
/* Snapshot every input before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
{
/* Numeric constants emitted by the generator; the name KPddddddddd carries
 * the first nine fractional digits of the value. */
const real_t KP600477271 =
+0.600477271932665282925769253334763009352012849;
const real_t KP875502302 =
+0.875502302409147941146295545768755143177842006;
const real_t KP520028571 =
+0.520028571888864619117130500499232802493238139;
const real_t KP575140729 =
+0.575140729474003121368385547455453388461001608;
const real_t KP300462606 =
+0.300462606288665774426601772289207995520941381;
const real_t KP516520780 =
+0.516520780623489722840901288569017135705033622;
const real_t KP968287244 =
+0.968287244361984016049539446938120421179794516;
const real_t KP503537032 =
+0.503537032863766627246873853868466977093348562;
const real_t KP251768516 =
+0.251768516431883313623436926934233488546674281;
const real_t KP581704778 =
+0.581704778510515730456870384989698884939833902;
const real_t KP859542535 =
+0.859542535098774820163672132761689612766401925;
const real_t KP083333333 =
+0.083333333333333333333333333333333333333333333;
const real_t KP957805992 =
+0.957805992594665126462521754605754580515587217;
const real_t KP522026385 =
+0.522026385161275033714027226654165028300441940;
const real_t KP853480001 =
+0.853480001859823990758994934970528322872359049;
const real_t KP769338817 =
+0.769338817572980603471413688209101117038278899;
const real_t KP612264650 =
+0.612264650376756543746494474777125408779395514;
const real_t KP038632954 =
+0.038632954644348171955506895830342264440241080;
const real_t KP302775637 =
+0.302775637731994646559610633735247973125648287;
const real_t KP514918778 =
+0.514918778086315755491789696138117261566051239;
const real_t KP686558370 =
+0.686558370781754340655719594850823015421401653;
const real_t KP226109445 =
+0.226109445035782405468510155372505010481906348;
const real_t KP301479260 =
+0.301479260047709873958013540496673347309208464;
const real_t KP866025403 =
+0.866025403784438646763723170752936183471402627;
const real_t KP500000000 =
+0.500000000000000000000000000000000000000000000;
{
/* These eight survive the inner scopes: they feed outputs 3, 4, 9 and 10,
 * which are written last. */
real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
{
real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw,
T2j, T2c, T1m;
real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
T1P = v0.y;
T1 = v0.x;
{
real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR,
T2b, Tv, Ts;
real_t T2a;
{
real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu,
Tl;
{
/* Read inputs 1..12 and form the first sums/differences of the .x lanes;
 * the .y lanes are only loaded here and combined in the next scope. */
real_t T7, T8, T9, Td, Te;
TK = v8.y;
Td = v8.x;
Te = v5.x;
TL = v5.y;
T16 = v12.y;
T7 = v12.x;
T8 = v10.x;
TY = v10.y;
TZ = v4.y;
T9 = v4.x;
T2d = Td - Te;
Tf = Td + Te;
{
real_t T2, Ta, T3, T4;
T2 = v1.x;
T13 = v1.y;
Ta = T8 + T9;
Tq = T8 - T9;
TW = v3.y;
T3 = v3.x;
T4 = v9.x;
TV = v9.y;
{
real_t Tg, T5, Th, Tj, Tk;
TN = v11.y;
Tg = v11.x;
Ty = fma (KP500000000, Ta, -(T7));
Tb = T7 + Ta;
Tr = T4 - T3;
T5 = T3 + T4;
Th = v6.x;
TO = v6.y;
TQ = v7.y;
Tj = v7.x;
Tk = v2.x;
TR = v2.y;
T6 = T2 + T5;
Tx = fma (-KP500000000, T5, T2);
Ti = Tg + Th;
Tt = Tg - Th;
Tu = Tj - Tk;
Tl = Tj + Tk;
}
}
}
{
real_t Tc, Tm, T2e, T2g;
Tc = T6 + Tb;
T2n = T6 - Tb;
T2b = Ti - Tl;
Tm = Ti + Tl;
T2e = Tt + Tu;
Tv = Tt - Tu;
Ts = Tq - Tr;
T2g = Tr + Tq;
{
real_t Tz, TA, Tn, T2f;
Tz = Tx - Ty;
T2a = Tx + Ty;
TA = fma (-KP500000000, Tm, Tf);
Tn = Tf + Tm;
T2f = fma (-KP500000000, T2e, T2d);
T2o = T2d + T2e;
/* To accumulates the .x lanes of inputs 1..12. */
To = Tc + Tn;
TH = Tc - Tn;
T2h = fma (KP866025403, T2g, T2f);
T2k = fma (-KP866025403, T2g, T2f);
TE = Tz - TA;
TB = Tz + TA;
}
}
}
{
/* Same combination pattern applied to the .y lanes loaded above. */
real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a,
T1b, TS, T17, T14;
TF = Ts - Tv;
Tw = Ts + Tv;
T2j = fma (-KP866025403, T2b, T2a);
T2c = fma (KP866025403, T2b, T2a);
T1R = TK + TL;
TM = TK - TL;
T17 = TY + TZ;
T10 = TY - TZ;
T18 = fma (KP500000000, T17, -(T16));
T1l = T16 + T17;
TX = TV - TW;
T14 = TW + TV;
T1k = T13 + T14;
T15 = fma (-KP500000000, T14, T13);
TP = TN - TO;
T1a = TN + TO;
T1b = TQ + TR;
TS = TQ - TR;
{
real_t T1Q, T11, TT, T1S;
T1Q = T1k + T1l;
T1m = T1k - T1l;
T11 = TX + T10;
T1W = T10 - TX;
T1X = TP - TS;
TT = TP + TS;
T1S = T1a + T1b;
T1c = T1a - T1b;
{
real_t T1Z, TU, T1T, T20;
T19 = T15 + T18;
T1Z = T15 - T18;
T1j = TM + TT;
TU = fma (-KP500000000, TT, TM);
T1T = T1R + T1S;
T20 = fma (-KP500000000, T1S, T1R);
T12 = fma (KP866025403, T11, TU);
T1f = fma (-KP866025403, T11, TU);
T21 = T1Z + T20;
T24 = T1Z - T20;
T27 = T1Q - T1T;
/* T1U accumulates the .y lanes of inputs 1..12. */
T1U = T1Q + T1T;
}
}
}
}
{
real_t T1g, T1d, T25, T1Y;
T1g = fma (-KP866025403, T1c, T19);
T1d = fma (KP866025403, T1c, T19);
T25 = T1W - T1X;
T1Y = T1W + T1X;
/* Output 0 is the plain sum of all thirteen inputs. */
v0.y = T1P + T1U;
v0.x = T1 + To;
{
real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
{
real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI,
T1s;
{
real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x,
T1y, T1H, T1i;
TC = fma (KP301479260, TB, Tw);
T1x = fma (-KP226109445, Tw, TB);
T1y = fma (KP686558370, TE, TF);
TG = fma (-KP514918778, TF, TE);
T1n = fma (-KP302775637, T1m, T1j);
T1G = fma (KP302775637, T1j, T1m);
T1u = fma (-KP038632954, T12, T1d);
T1e = fma (KP038632954, T1d, T12);
T1h = fma (KP612264650, T1g, T1f);
T1v = fma (-KP612264650, T1f, T1g);
T1J = fma (KP769338817, T1y, T1x);
T1z = fma (-KP769338817, T1y, T1x);
T1H = fma (-KP853480001, T1v, T1u);
T1w = fma (KP853480001, T1v, T1u);
T1I = fma (-KP522026385, T1H, T1G);
T1O = fma (KP957805992, T1G, T1H);
Tp = fma (-KP083333333, To, T1);
T1E = fma (KP853480001, T1h, T1e);
T1i = fma (-KP853480001, T1h, T1e);
T1q = fma (-KP859542535, TG, TH);
TI = fma (KP581704778, TH, TG);
T1o = fma (KP957805992, T1n, T1i);
T1s = fma (-KP522026385, T1i, T1n);
}
{
real_t T1D, T1p, TD, T1r;
T1p = fma (-KP251768516, TC, Tp);
TD = fma (KP503537032, TC, Tp);
T1C = fma (KP968287244, T1z, T1w);
T1A = fma (-KP968287244, T1z, T1w);
TJ = fma (KP516520780, TI, TD);
T1N = fma (-KP516520780, TI, TD);
T1D = fma (-KP300462606, T1q, T1p);
T1r = fma (KP300462606, T1q, T1p);
T1t = fma (KP575140729, T1s, T1r);
T1B = fma (-KP575140729, T1s, T1r);
T1L = fma (KP520028571, T1E, T1D);
T1F = fma (-KP520028571, T1E, T1D);
T1K = fma (-KP875502302, T1J, T1I);
T1M = fma (KP875502302, T1J, T1I);
}
}
{
real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C,
T28, T2y, T2M, T2q;
{
real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D,
T2E, T2u, T2m;
T2D = fma (-KP226109445, T1Y, T21);
T22 = fma (KP301479260, T21, T1Y);
T26 = fma (-KP514918778, T25, T24);
T2E = fma (KP686558370, T24, T25);
T2v = fma (-KP302775637, T2n, T2o);
T2p = fma (KP302775637, T2o, T2n);
T2i = fma (-KP038632954, T2h, T2c);
T2s = fma (KP038632954, T2c, T2h);
T2t = fma (KP612264650, T2j, T2k);
T2l = fma (-KP612264650, T2k, T2j);
T2F = fma (-KP769338817, T2E, T2D);
T2N = fma (KP769338817, T2E, T2D);
T2K = fma (KP853480001, T2t, T2s);
T2u = fma (-KP853480001, T2t, T2s);
T2w = fma (KP957805992, T2v, T2u);
T2A = fma (-KP522026385, T2u, T2v);
T1V = fma (-KP083333333, T1U, T1P);
T2m = fma (-KP853480001, T2l, T2i);
T2C = fma (KP853480001, T2l, T2i);
T28 = fma (KP581704778, T27, T26);
T2y = fma (-KP859542535, T26, T27);
T2M = fma (-KP522026385, T2m, T2p);
T2q = fma (KP957805992, T2p, T2m);
}
{
real_t T2O, T2Q, T2z, T2P, T2L;
{
real_t T23, T2x, T2r, T29, T2J;
T23 = fma (KP503537032, T22, T1V);
T2x = fma (-KP251768516, T22, T1V);
T2O = fma (-KP875502302, T2N, T2M);
T2Q = fma (KP875502302, T2N, T2M);
T2r = fma (KP516520780, T28, T23);
T29 = fma (-KP516520780, T28, T23);
T2z = fma (KP300462606, T2y, T2x);
T2J = fma (-KP300462606, T2y, T2x);
/* Outputs come in pairs (12,1), (8,5), (2,7), (6,11): each pair shares its
 * operands and differs only in the sign of the fma coefficient. */
v12.x = fma (-KP600477271, T1o, TJ);
v12.y = fma (KP600477271, T2w, T2r);
v1.y = fma (-KP600477271, T2w, T2r);
v1.x = fma (KP600477271, T1o, TJ);
v8.x = fma (KP600477271, T1O, T1N);
v8.y = fma (-KP600477271, T2q, T29);
v5.y = fma (KP600477271, T2q, T29);
v5.x = fma (-KP600477271, T1O, T1N);
T2P = fma (KP520028571, T2K, T2J);
T2L = fma (-KP520028571, T2K, T2J);
}
T2B = fma (KP575140729, T2A, T2z);
T2H = fma (-KP575140729, T2A, T2z);
v2.x = fma (-KP575140729, T1K, T1F);
v2.y = fma (KP575140729, T2Q, T2P);
v7.y = fma (-KP575140729, T2Q, T2P);
v7.x = fma (KP575140729, T1K, T1F);
v6.x = fma (-KP575140729, T1M, T1L);
v6.y = fma (KP575140729, T2O, T2L);
v11.y = fma (-KP575140729, T2O, T2L);
v11.x = fma (KP575140729, T1M, T1L);
T2I = fma (KP968287244, T2F, T2C);
T2G = fma (-KP968287244, T2F, T2C);
}
}
}
}
}
/* Remaining pairs (3,9) and (4,10), built from the temporaries kept alive in
 * the outer scope. */
v3.x = fma (-KP520028571, T1A, T1t);
v3.y = fma (KP520028571, T2I, T2H);
v9.y = fma (-KP520028571, T2I, T2H);
v9.x = fma (KP520028571, T1A, T1t);
v4.x = fma (-KP520028571, T1C, T1B);
v4.y = fma (KP520028571, T2G, T2B);
v10.y = fma (-KP520028571, T2G, T2B);
v10.x = fma (KP520028571, T1C, T1B);
}
}
/* Write all thirteen results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
}
/*
 * radix: one radix-13 pass over a batch of complex sequences.
 *
 * x        input buffer (read only)
 * y        output buffer
 * p        spacing between the 13 results of one butterfly in y; also the
 *          period of the twiddle index (k = i % p)
 * threads  number of butterflies per batch; work-items whose
 *          get_global_id(0) is >= threads return immediately
 *
 * Work-item (i, b) reads 13 inputs spaced `threads` apart starting at
 * i + b*threads*13, multiplies inputs 1..12 by twiddle factors when p != 1,
 * runs dft13 on them, and writes the 13 results spaced `p` apart starting at
 * k + (i - k)*13 + b*threads*13.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads) return;

    /* Twiddle angle for each leg before scaling by k / p; numerically these
     * literals are m * (-2*pi/13). Leg 0 is never rotated. */
    const double leg_angle[13] = {
        0.0,
        -0.483321946706122,
        -0.966643893412244,
        -1.449965840118366,
        -1.933287786824488,
        -2.41660973353061,
        -2.899931680236732,
        -3.383253626942854,
        -3.866575573648976,
        -4.349897520355098,
        -4.833219467061221,
        -5.316541413767341,
        -5.799863360473465
    };

    const size_t phase = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 13;

    /* Gather the 13 legs of this butterfly. */
    global const double2 * src = x + (gid + batch_base);
    double2 leg[13];
    for (uint m = 0; m < 13; ++m)
        leg[m] = src[m * threads];

    /* The first pass (p == 1) needs no twiddles. */
    if (p != 1)
    {
        for (uint m = 1; m < 13; ++m)
            leg[m] = mul(leg[m], twiddle(leg_angle[m] * phase / p));
    }

    dft13(&leg[0], &leg[1], &leg[2], &leg[3], &leg[4], &leg[5], &leg[6],
          &leg[7], &leg[8], &leg[9], &leg[10], &leg[11], &leg[12]);

    /* Scatter the results with stride p. */
    global double2 * dst = y + (phase + (gid - phase) * 13 + batch_base);
    for (uint m = 0; m < 13; ++m)
        dst[m * p] = leg[m];
}
/* DEVICE expands to nothing; it only fills the qualifier slot in front of the
 * generated DFT routine further down. */
#define DEVICE
/* Enable double precision through whichever fp64 extension macro is defined. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component element types used by the generated DFT code. */
typedef double real_t;
typedef double2 real2_t;
/*
 * mul: product of a with the complex conjugate of b, i.e. a * conj(b),
 * with .x as the real part and .y as the imaginary part of each operand.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 product;
    product.x = a.x * b.x + a.y * b.y;
    product.y = a.y * b.x - a.x * b.y;
    return product;
}
/*
 * twiddle: returns (cos(alpha), sin(alpha)) packed as (.x, .y).
 * sincos() hands back the sine and stores the cosine through its pointer.
 */
double2 twiddle
(
    double alpha
)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);

    double2 w;
    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 114 stack variables, 25 constants, and 52 memory accesses
*/
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
{
const real_t KP600477271 =
+0.600477271932665282925769253334763009352012849;
const real_t KP875502302 =
+0.875502302409147941146295545768755143177842006;
const real_t KP520028571 =
+0.520028571888864619117130500499232802493238139;
const real_t KP575140729 =
+0.575140729474003121368385547455453388461001608;
const real_t KP300462606 =
+0.300462606288665774426601772289207995520941381;
const real_t KP516520780 =
+0.516520780623489722840901288569017135705033622;
const real_t KP968287244 =
+0.968287244361984016049539446938120421179794516;
const real_t KP503537032 =
+0.503537032863766627246873853868466977093348562;
const real_t KP251768516 =
+0.251768516431883313623436926934233488546674281;
const real_t KP581704778 =
+0.581704778510515730456870384989698884939833902;
const real_t KP859542535 =
+0.859542535098774820163672132761689612766401925;
const real_t KP083333333 =
+0.083333333333333333333333333333333333333333333;
const real_t KP957805992 =
+0.957805992594665126462521754605754580515587217;
const real_t KP522026385 =
+0.522026385161275033714027226654165028300441940;
const real_t KP853480001 =
+0.853480001859823990758994934970528322872359049;
const real_t KP769338817 =
+0.769338817572980603471413688209101117038278899;
const real_t KP612264650 =
+0.612264650376756543746494474777125408779395514;
const real_t KP038632954 =
+0.038632954644348171955506895830342264440241080;
const real_t KP302775637 =
+0.302775637731994646559610633735247973125648287;
const real_t KP514918778 =
+0.514918778086315755491789696138117261566051239;
const real_t KP686558370 =
+0.686558370781754340655719594850823015421401653;
const real_t KP226109445 =
+0.226109445035782405468510155372505010481906348;
const real_t KP301479260 =
+0.301479260047709873958013540496673347309208464;
const real_t KP866025403 =
+0.866025403784438646763723170752936183471402627;
const real_t KP500000000 =
+0.500000000000000000000000000000000000000000000;
{
real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
{
real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw,
T2j, T2c, T1m;
real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
T1P = v0.y;
T1 = v0.x;
{
real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR,
T2b, Tv, Ts;
real_t T2a;
{
real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu,
Tl;
{
real_t T7, T8, T9, Td, Te;
TK = v8.y;
Td = v8.x;
Te = v5.x;
TL = v5.y;
T16 = v12.y;
T7 = v12.x;
T8 = v10.x;
TY = v10.y;
TZ = v4.y;
T9 = v4.x;
T2d = Td - Te;
Tf = Td + Te;
{
real_t T2, Ta, T3, T4;
T2 = v1.x;
T13 = v1.y;
Ta = T8 + T9;
Tq = T8 - T9;
TW = v3.y;
T3 = v3.x;
T4 = v9.x;
TV = v9.y;
{
real_t Tg, T5, Th, Tj, Tk;
TN = v11.y;
Tg = v11.x;
Ty = fma (KP500000000, Ta, -(T7));
Tb = T7 + Ta;
Tr = T4 - T3;
T5 = T3 + T4;
Th = v6.x;
TO = v6.y;
TQ = v7.y;
Tj = v7.x;
Tk = v2.x;
TR = v2.y;
T6 = T2 + T5;
Tx = fma (-KP500000000, T5, T2);
Ti = Tg + Th;
Tt = Tg - Th;
Tu = Tj - Tk;
Tl = Tj + Tk;
}
}
}
{
real_t Tc, Tm, T2e, T2g;
Tc = T6 + Tb;
T2n = T6 - Tb;
T2b = Ti - Tl;
Tm = Ti + Tl;
T2e = Tt + Tu;
Tv = Tt - Tu;
Ts = Tq - Tr;
T2g = Tr + Tq;
{
real_t Tz, TA, Tn, T2f;
Tz = Tx - Ty;
T2a = Tx + Ty;
TA = fma (-KP500000000, Tm, Tf);
Tn = Tf + Tm;
T2f = fma (-KP500000000, T2e, T2d);
T2o = T2d + T2e;
To = Tc + Tn;
TH = Tc - Tn;
T2h = fma (KP866025403, T2g, T2f);
T2k = fma (-KP866025403, T2g, T2f);
TE = Tz - TA;
TB = Tz + TA;
}
}
}
{
real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a,
T1b, TS, T17, T14;
TF = Ts - Tv;
Tw = Ts + Tv;
T2j = fma (-KP866025403, T2b, T2a);
T2c = fma (KP866025403, T2b, T2a);
T1R = TK + TL;
TM = TK - TL;
T17 = TY + TZ;
T10 = TY - TZ;
T18 = fma (KP500000000, T17, -(T16));
T1l = T16 + T17;
TX = TV - TW;
T14 = TW + TV;
T1k = T13 + T14;
T15 = fma (-KP500000000, T14, T13);
TP = TN - TO;
T1a = TN + TO;
T1b = TQ + TR;
TS = TQ - TR;
{
real_t T1Q, T11, TT, T1S;
T1Q = T1k + T1l;
T1m = T1k - T1l;
T11 = TX + T10;
T1W = T10 - TX;
T1X = TP - TS;
TT = TP + TS;
T1S = T1a + T1b;
T1c = T1a - T1b;
{
real_t T1Z, TU, T1T, T20;
T19 = T15 + T18;
T1Z = T15 - T18;
T1j = TM + TT;
TU = fma (-KP500000000, TT, TM);
T1T = T1R + T1S;
T20 = fma (-KP500000000, T1S, T1R);
T12 = fma (KP866025403, T11, TU);
T1f = fma (-KP866025403, T11, TU);
T21 = T1Z + T20;
T24 = T1Z - T20;
T27 = T1Q - T1T;
T1U = T1Q + T1T;
}
}
}
}
{
real_t T1g, T1d, T25, T1Y;
T1g = fma (-KP866025403, T1c, T19);
T1d = fma (KP866025403, T1c, T19);
T25 = T1W - T1X;
T1Y = T1W + T1X;
v0.y = T1P + T1U;
v0.x = T1 + To;
{
real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
{
real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI,
T1s;
{
real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x,
T1y, T1H, T1i;
TC = fma (KP301479260, TB, Tw);
T1x = fma (-KP226109445, Tw, TB);
T1y = fma (KP686558370, TE, TF);
TG = fma (-KP514918778, TF, TE);
T1n = fma (-KP302775637, T1m, T1j);
T1G = fma (KP302775637, T1j, T1m);
T1u = fma (-KP038632954, T12, T1d);
T1e = fma (KP038632954, T1d, T12);
T1h = fma (KP612264650, T1g, T1f);
T1v = fma (-KP612264650, T1f, T1g);
T1J = fma (KP769338817, T1y, T1x);
T1z = fma (-KP769338817, T1y, T1x);
T1H = fma (-KP853480001, T1v, T1u);
T1w = fma (KP853480001, T1v, T1u);
T1I = fma (-KP522026385, T1H, T1G);
T1O = fma (KP957805992, T1G, T1H);
Tp = fma (-KP083333333, To, T1);
T1E = fma (KP853480001, T1h, T1e);
T1i = fma (-KP853480001, T1h, T1e);
T1q = fma (-KP859542535, TG, TH);
TI = fma (KP581704778, TH, TG);
T1o = fma (KP957805992, T1n, T1i);
T1s = fma (-KP522026385, T1i, T1n);
}
{
real_t T1D, T1p, TD, T1r;
T1p = fma (-KP251768516, TC, Tp);
TD = fma (KP503537032, TC, Tp);
T1C = fma (KP968287244, T1z, T1w);
T1A = fma (-KP968287244, T1z, T1w);
TJ = fma (KP516520780, TI, TD);
T1N = fma (-KP516520780, TI, TD);
T1D = fma (-KP300462606, T1q, T1p);
T1r = fma (KP300462606, T1q, T1p);
T1t = fma (KP575140729, T1s, T1r);
T1B = fma (-KP575140729, T1s, T1r);
T1L = fma (KP520028571, T1E, T1D);
T1F = fma (-KP520028571, T1E, T1D);
T1K = fma (-KP875502302, T1J, T1I);
T1M = fma (KP875502302, T1J, T1I);
}
}
{
real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C,
T28, T2y, T2M, T2q;
{
real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D,
T2E, T2u, T2m;
T2D = fma (-KP226109445, T1Y, T21);
T22 = fma (KP301479260, T21, T1Y);
T26 = fma (-KP514918778, T25, T24);
T2E = fma (KP686558370, T24, T25);
T2v = fma (-KP302775637, T2n, T2o);
T2p = fma (KP302775637, T2o, T2n);
T2i = fma (-KP038632954, T2h, T2c);
T2s = fma (KP038632954, T2c, T2h);
T2t = fma (KP612264650, T2j, T2k);
T2l = fma (-KP612264650, T2k, T2j);
T2F = fma (-KP769338817, T2E, T2D);
T2N = fma (KP769338817, T2E, T2D);
T2K = fma (KP853480001, T2t, T2s);
T2u = fma (-KP853480001, T2t, T2s);
T2w = fma (KP957805992, T2v, T2u);
T2A = fma (-KP522026385, T2u, T2v);
T1V = fma (-KP083333333, T1U, T1P);
T2m = fma (-KP853480001, T2l, T2i);
T2C = fma (KP853480001, T2l, T2i);
T28 = fma (KP581704778, T27, T26);
T2y = fma (-KP859542535, T26, T27);
T2M = fma (-KP522026385, T2m, T2p);
T2q = fma (KP957805992, T2p, T2m);
}
{
real_t T2O, T2Q, T2z, T2P, T2L;
{
real_t T23, T2x, T2r, T29, T2J;
T23 = fma (KP503537032, T22, T1V);
T2x = fma (-KP251768516, T22, T1V);
T2O = fma (-KP875502302, T2N, T2M);
T2Q = fma (KP875502302, T2N, T2M);
T2r = fma (KP516520780, T28, T23);
T29 = fma (-KP516520780, T28, T23);
T2z = fma (KP300462606, T2y, T2x);
T2J = fma (-KP300462606, T2y, T2x);
v12.x = fma (-KP600477271, T1o, TJ);
v12.y = fma (KP600477271, T2w, T2r);
v1.y = fma (-KP600477271, T2w, T2r);
v1.x = fma (KP600477271, T1o, TJ);
v8.x = fma (KP600477271, T1O, T1N);
v8.y = fma (-KP600477271, T2q, T29);
v5.y = fma (KP600477271, T2q, T29);
v5.x = fma (-KP600477271, T1O, T1N);
T2P = fma (KP520028571, T2K, T2J);
T2L = fma (-KP520028571, T2K, T2J);
}
T2B = fma (KP575140729, T2A, T2z);
T2H = fma (-KP575140729, T2A, T2z);
v2.x = fma (-KP575140729, T1K, T1F);
v2.y = fma (KP575140729, T2Q, T2P);
v7.y = fma (-KP575140729, T2Q, T2P);
v7.x = fma (KP575140729, T1K, T1F);
v6.x = fma (-KP575140729, T1M, T1L);
v6.y = fma (KP575140729, T2O, T2L);
v11.y = fma (-KP575140729, T2O, T2L);
v11.x = fma (KP575140729, T1M, T1L);
T2I = fma (KP968287244, T2F, T2C);
T2G = fma (-KP968287244, T2F, T2C);
}
}
}
}
}
v3.x = fma (-KP520028571, T1A, T1t);
v3.y = fma (KP520028571, T2I, T2H);
v9.y = fma (-KP520028571, T2I, T2H);
v9.x = fma (KP520028571, T1A, T1t);
v4.x = fma (-KP520028571, T1C, T1B);
v4.y = fma (KP520028571, T2G, T2B);
v10.y = fma (-KP520028571, T2G, T2B);
v10.x = fma (KP520028571, T1C, T1B);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
}
/*
 * One radix-13 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads 13 inputs spaced `threads` elements apart, multiplies
 * inputs 1..12 by a twiddle factor when p != 1, runs dft13 on them and
 * stores the 13 results spaced `p` elements apart. get_global_id(1)
 * selects the batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    /* Twiddle angle literal for each input; slot 0 is never read. */
    const double angle[13] = {
        0.0,
        -0.483321946706122,
        -0.966643893412244,
        -1.449965840118366,
        -1.933287786824488,
        -2.41660973353061,
        -2.899931680236732,
        -3.383253626942854,
        -3.866575573648976,
        -4.349897520355098,
        -4.833219467061221,
        -5.316541413767341,
        -5.799863360473465
    };

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 13;

    global const double2 *src = x + (gid + batch_offset);
    double2 v[13];
    for (uint n = 0; n < 13; ++n)
        v[n] = src[n * threads];

    if (p != 1) {
        for (uint n = 1; n < 13; ++n)
            v[n] = mul(v[n], twiddle(angle[n] * k / p));
    }

    dft13(&v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6],
          &v[7], &v[8], &v[9], &v[10], &v[11], &v[12]);

    global double2 *dst = y + ((k + (gid - k) * 13) + batch_offset);
    for (uint n = 0; n < 13; ++n)
        dst[n * p] = v[n];
}
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){0.50411241533730589} exceeds 1e-08
FFT(C2C) size=10x2 batch=22
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b; .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/* In-place 2-point butterfly: *u0 receives the sum, *u1 the difference. */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    *u0 = a + b;
    *u1 = a - b;
}
/*
 * One radix-2 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads two inputs spaced `threads` elements apart, multiplies the
 * second by a twiddle factor when p != 1, runs dft2 and stores the two
 * results spaced `p` elements apart. get_global_id(1) selects the batch
 * through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 2;

    global const double2 *src = x + (gid + batch_offset);
    double2 first = src[0 * threads];
    double2 second = src[1 * threads];

    if (p != 1)
        second = mul(second, twiddle((double)-3.141592653589793 * k / p));

    dft2(&first, &second);

    global double2 *dst = y + ((k + (gid - k) * 2) + batch_offset);
    dst[0 * p] = first;
    dst[1 * p] = second;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b; .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/* In-place 2-point butterfly: *u0 receives the sum, *u1 the difference. */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    *u0 = a + b;
    *u1 = a - b;
}
/*
 * One radix-2 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads two inputs spaced `threads` elements apart, multiplies the
 * second by a twiddle factor when p != 1, runs dft2 and stores the two
 * results spaced `p` elements apart. get_global_id(1) selects the batch
 * through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 2;

    global const double2 *src = x + (gid + batch_offset);
    double2 first = src[0 * threads];
    double2 second = src[1 * threads];

    if (p != 1)
        second = mul(second, twiddle((double)-3.141592653589793 * k / p));

    dft2(&first, &second);

    global double2 *dst = y + ((k + (gid - k) * 2) + batch_offset);
    dst[0 * p] = first;
    dst[1 * p] = second;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b; .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 41 stack variables, 4 constants, and 20 memory accesses
*/
/*
 * In-place 5-point transform on the values behind u0..u4 (the generator was
 * invoked with "-n 5 -sign -1"). All five inputs are copied into locals
 * first, the results are computed with additions and fma() chains, and the
 * five outputs are written back through the same pointers at the end.
 * .x and .y of each value are processed as separate real_t scalars.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
/* Snapshot every input before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
{
/* Numerically sin(2*pi/5). */
const real_t KP951056516 =
+0.951056516295153572116439333379382143405698634;
/* Numerically sqrt(5)/4. */
const real_t KP559016994 =
+0.559016994374947424102293417182819058860154590;
/* Exactly 1/4. */
const real_t KP250000000 =
+0.250000000000000000000000000000000000000000000;
/* Numerically (sqrt(5)-1)/2. */
const real_t KP618033988 =
+0.618033988749894848204586834365638117720309180;
{
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
{
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
Tl = v0.y;
T1 = v0.x;
{
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
Tc = v1.y;
T2 = v1.x;
T3 = v4.x;
Td = v4.y;
Tf = v2.y;
T5 = v2.x;
T6 = v3.x;
Tg = v3.y;
/* Sums and differences of the pairs (v1, v4) and (v2, v3). */
Ts = T2 - T3;
T4 = T2 + T3;
Tt = T5 - T6;
T7 = T5 + T6;
T8 = T4 + T7;
Ta = T4 - T7;
Te = Tc - Td;
Tm = Tc + Td;
Tn = Tf + Tg;
Th = Tf - Tg;
}
To = Tm + Tn;
Tq = Tm - Tn;
Ti = fma (KP618033988, Th, Te);
Tk = fma (-KP618033988, Te, Th);
/* Output 0 is the plain sum of all five inputs. */
v0.y = Tl + To;
v0.x = T1 + T8;
T9 = fma (-KP250000000, T8, T1);
Tu = fma (KP618033988, Tt, Ts);
Tw = fma (-KP618033988, Ts, Tt);
Tp = fma (-KP250000000, To, Tl);
Tb = fma (KP559016994, Ta, T9);
Tj = fma (-KP559016994, Ta, T9);
}
Tr = fma (KP559016994, Tq, Tp);
Tv = fma (-KP559016994, Tq, Tp);
/* Outputs 2/3 and 4/1 are formed as pairs that differ only in the sign of the KP951056516 term. */
v2.x = fma (-KP951056516, Tk, Tj);
v2.y = fma (KP951056516, Tw, Tv);
v3.y = fma (-KP951056516, Tw, Tv);
v3.x = fma (KP951056516, Tk, Tj);
v4.x = fma (-KP951056516, Ti, Tb);
v4.y = fma (KP951056516, Tu, Tr);
v1.y = fma (-KP951056516, Tu, Tr);
v1.x = fma (KP951056516, Ti, Tb);
}
}
/* Write all results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
}
/*
 * One radix-5 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads five inputs spaced `threads` elements apart, multiplies
 * inputs 1..4 by a twiddle factor when p != 1, runs dft5 on them and stores
 * the five results spaced `p` elements apart. get_global_id(1) selects the
 * batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    /* Twiddle angle literal for each input; slot 0 is never read. */
    const double angle[5] = {
        0.0,
        -1.256637061435917,
        -2.513274122871834,
        -3.769911184307752,
        -5.026548245743669
    };

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 5;

    global const double2 *src = x + (gid + batch_offset);
    double2 v[5];
    for (uint n = 0; n < 5; ++n)
        v[n] = src[n * threads];

    if (p != 1) {
        for (uint n = 1; n < 5; ++n)
            v[n] = mul(v[n], twiddle(angle[n] * k / p));
    }

    dft5(&v[0], &v[1], &v[2], &v[3], &v[4]);

    global double2 *dst = y + ((k + (gid - k) * 5) + batch_offset);
    for (uint n = 0; n < 5; ++n)
        dst[n * p] = v[n];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * conj(b); .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/* In-place 2-point butterfly: *u0 receives the sum, *u1 the difference. */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    *u0 = a + b;
    *u1 = a - b;
}
/*
 * One radix-2 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads two inputs spaced `threads` elements apart, combines the
 * second with a twiddle factor through mul() when p != 1, runs dft2 and
 * stores the two results spaced `p` elements apart. get_global_id(1) selects
 * the batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 2;

    global const double2 *src = x + (gid + batch_offset);
    double2 first = src[0 * threads];
    double2 second = src[1 * threads];

    if (p != 1)
        second = mul(second, twiddle((double)-3.141592653589793 * k / p));

    dft2(&first, &second);

    global double2 *dst = y + ((k + (gid - k) * 2) + batch_offset);
    dst[0 * p] = first;
    dst[1 * p] = second;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * conj(b); .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 6 stack variables, 0 constants, and 8 memory accesses
*/
/* In-place 2-point butterfly: *u0 receives the sum, *u1 the difference. */
DEVICE void
dft2 (real2_t * u0, real2_t * u1)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    *u0 = a + b;
    *u1 = a - b;
}
/*
 * One radix-2 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads two inputs spaced `threads` elements apart, combines the
 * second with a twiddle factor through mul() when p != 1, runs dft2 and
 * stores the two results spaced `p` elements apart. get_global_id(1) selects
 * the batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 2;

    global const double2 *src = x + (gid + batch_offset);
    double2 first = src[0 * threads];
    double2 second = src[1 * threads];

    if (p != 1)
        second = mul(second, twiddle((double)-3.141592653589793 * k / p));

    dft2(&first, &second);

    global double2 *dst = y + ((k + (gid - k) * 2) + batch_offset);
    dst[0 * p] = first;
    dst[1 * p] = second;
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * conj(b); .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 41 stack variables, 4 constants, and 20 memory accesses
*/
/*
 * In-place 5-point transform on the values behind u0..u4 (the generator was
 * invoked with "-n 5 -sign 1"). All five inputs are copied into locals
 * first, the results are computed with additions and fma() chains, and the
 * five outputs are written back through the same pointers at the end.
 * .x and .y of each value are processed as separate real_t scalars.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
/* Snapshot every input before any output is produced. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
{
/* Numerically sin(2*pi/5). */
const real_t KP951056516 =
+0.951056516295153572116439333379382143405698634;
/* Numerically sqrt(5)/4. */
const real_t KP559016994 =
+0.559016994374947424102293417182819058860154590;
/* Exactly 1/4. */
const real_t KP250000000 =
+0.250000000000000000000000000000000000000000000;
/* Numerically (sqrt(5)-1)/2. */
const real_t KP618033988 =
+0.618033988749894848204586834365638117720309180;
{
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
{
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
Tl = v0.y;
T1 = v0.x;
{
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
Tc = v1.y;
T2 = v1.x;
T3 = v4.x;
Td = v4.y;
Tf = v2.y;
T5 = v2.x;
T6 = v3.x;
Tg = v3.y;
/* Sums and differences of the pairs (v1, v4) and (v2, v3). */
Ts = T2 - T3;
T4 = T2 + T3;
Tt = T5 - T6;
T7 = T5 + T6;
T8 = T4 + T7;
Ta = T4 - T7;
Te = Tc - Td;
Tm = Tc + Td;
Tn = Tf + Tg;
Th = Tf - Tg;
}
To = Tm + Tn;
Tq = Tm - Tn;
Ti = fma (KP618033988, Th, Te);
Tk = fma (-KP618033988, Te, Th);
/* Output 0 is the plain sum of all five inputs. */
v0.y = Tl + To;
v0.x = T1 + T8;
T9 = fma (-KP250000000, T8, T1);
Tu = fma (KP618033988, Tt, Ts);
Tw = fma (-KP618033988, Ts, Tt);
Tp = fma (-KP250000000, To, Tl);
Tb = fma (KP559016994, Ta, T9);
Tj = fma (-KP559016994, Ta, T9);
}
Tr = fma (KP559016994, Tq, Tp);
Tv = fma (-KP559016994, Tq, Tp);
/* Outputs 2/3 and 4/1 are formed as pairs that differ only in the sign of the KP951056516 term. */
v2.x = fma (KP951056516, Tk, Tj);
v2.y = fma (-KP951056516, Tw, Tv);
v3.y = fma (KP951056516, Tw, Tv);
v3.x = fma (-KP951056516, Tk, Tj);
v4.x = fma (KP951056516, Ti, Tb);
v4.y = fma (-KP951056516, Tu, Tr);
v1.y = fma (KP951056516, Tu, Tr);
v1.x = fma (-KP951056516, Ti, Tb);
}
}
/* Write all results back through the caller's pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
}
/*
 * One radix-5 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads five inputs spaced `threads` elements apart, combines
 * inputs 1..4 with a twiddle factor through mul() when p != 1, runs dft5 on
 * them and stores the five results spaced `p` elements apart.
 * get_global_id(1) selects the batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    /* Twiddle angle literal for each input; slot 0 is never read. */
    const double angle[5] = {
        0.0,
        -1.256637061435917,
        -2.513274122871834,
        -3.769911184307752,
        -5.026548245743669
    };

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 5;

    global const double2 *src = x + (gid + batch_offset);
    double2 v[5];
    for (uint n = 0; n < 5; ++n)
        v[n] = src[n * threads];

    if (p != 1) {
        for (uint n = 1; n < 5; ++n)
            v[n] = mul(v[n], twiddle(angle[n] * k / p));
    }

    dft5(&v[0], &v[1], &v[2], &v[3], &v[4]);

    global double2 *dst = y + ((k + (gid - k) * 5) + batch_offset);
    for (uint n = 0; n < 5; ++n)
        dst[n * p] = v[n];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[out_x + out_y * height], where the
 * output coordinates are built from the opposite dimension's local and group
 * ids. The value is staged through a one-element local buffer, and every
 * work-item reaches the barrier whether or not it is in range.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates swap the roles of the two dimensions. */
    const size_t out_x = ly + get_group_id(1);
    const size_t out_y = lx + get_group_id(0);

    const bool in_range = (gx < width) && (gy < height);
    const size_t slot = lx + ly;

    local double2 block[1];
    if (in_range)
        block[slot] = input[gx + gy * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (in_range)
        output[out_x + out_y * height] = block[slot];
}
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){0.29894604660804136} exceeds 1e-08
FFT(C2C) size=100 batch=4
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b; .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 16 stack variables, 0 constants, and 16 memory accesses
*/
/*
 * In-place 4-point transform of the values behind u0..u3, built only from
 * additions and subtractions. All inputs are read before any output is
 * written.
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
    const real2_t a = *u0;
    const real2_t b = *u1;
    const real2_t c = *u2;
    const real2_t d = *u3;

    /* Butterflies on the (0, 2) and (1, 3) input pairs. */
    const real2_t even_sum = a + c;
    const real2_t even_dif = a - c;
    const real2_t odd_sum = b + d;
    const real2_t odd_dif = b - d;

    /* Outputs 1 and 3 mix the x and y parts of the odd difference. */
    const real2_t out1 = {even_dif.x + odd_dif.y, even_dif.y - odd_dif.x};
    const real2_t out3 = {even_dif.x - odd_dif.y, odd_dif.x + even_dif.y};

    *u0 = even_sum + odd_sum;
    *u1 = out1;
    *u2 = even_sum - odd_sum;
    *u3 = out3;
}
/*
 * One radix-4 pass. Every work-item whose get_global_id(0) is below
 * `threads` loads four inputs spaced `threads` elements apart, multiplies
 * inputs 1..3 by a twiddle factor when p != 1, runs dft4 on them and stores
 * the four results spaced `p` elements apart. get_global_id(1) selects the
 * batch through batch_offset.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    /* Twiddle angle literal for each input; slot 0 is never read. */
    const double angle[4] = {
        0.0,
        -1.570796326794897,
        -3.141592653589793,
        -4.71238898038469
    };

    const size_t k = gid % p;
    const size_t batch_offset = get_global_id(1) * threads * 4;

    global const double2 *src = x + (gid + batch_offset);
    double2 v[4];
    for (uint n = 0; n < 4; ++n)
        v[n] = src[n * threads];

    if (p != 1) {
        for (uint n = 1; n < 4; ++n)
            v[n] = mul(v[n], twiddle(angle[n] * k / p));
    }

    dft4(&v[0], &v[1], &v[2], &v[3]);

    global double2 *dst = y + ((k + (gid - k) * 4) + batch_offset);
    for (uint n = 0; n < 4; ++n)
        dst[n * p] = v[n];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b; .x carries the real part and .y the imaginary part. */
double2 mul(double2 a, double2 b)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    return (double2)(re, im);
}
/* Returns (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    const double sine = sincos(alpha, &cosine);
    return (double2)(cosine, sine);
}
/* Generated by: ./cl_gen_notw.native -n 25 -name dft25 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 352 FP additions, 268 FP multiplications,
* (or, 84 additions, 0 multiplications, 268 fused multiply/add),
* 188 stack variables, 47 constants, and 100 memory accesses
*/
DEVICE void
dft25 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13,
real2_t * u14, real2_t * u15, real2_t * u16, real2_t * u17,
real2_t * u18, real2_t * u19, real2_t * u20, real2_t * u21,
real2_t * u22, real2_t * u23, real2_t * u24)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
real2_t v13 = *u13;
real2_t v14 = *u14;
real2_t v15 = *u15;
real2_t v16 = *u16;
real2_t v17 = *u17;
real2_t v18 = *u18;
real2_t v19 = *u19;
real2_t v20 = *u20;
real2_t v21 = *u21;
real2_t v22 = *u22;
real2_t v23 = *u23;
real2_t v24 = *u24;
{
const real_t KP803003575 =
+0.803003575438660414833440593570376004635464850;
const real_t KP554608978 =
+0.554608978404018097464974850792216217022558774;
const real_t KP992114701 =
+0.992114701314477831049793042785778521453036709;
const real_t KP248028675 =
+0.248028675328619457762448260696444630363259177;
const real_t KP726211448 =
+0.726211448929902658173535992263577167607493062;
const real_t KP525970792 =
+0.525970792408939708442463226536226366643874659;
const real_t KP851038619 =
+0.851038619207379630836264138867114231259902550;
const real_t KP912575812 =
+0.912575812670962425556968549836277086778922727;
const real_t KP912018591 =
+0.912018591466481957908415381764119056233607330;
const real_t KP943557151 =
+0.943557151597354104399655195398983005179443399;
const real_t KP614372930 =
+0.614372930789563808870829930444362096004872855;
const real_t KP621716863 =
+0.621716863012209892444754556304102309693593202;
const real_t KP994076283 =
+0.994076283785401014123185814696322018529298887;
const real_t KP734762448 =
+0.734762448793050413546343770063151342619912334;
const real_t KP772036680 =
+0.772036680810363904029489473607579825330539880;
const real_t KP126329378 =
+0.126329378446108174786050455341811215027378105;
const real_t KP827271945 =
+0.827271945972475634034355757144307982555673741;
const real_t KP860541664 =
+0.860541664367944677098261680920518816412804187;
const real_t KP949179823 =
+0.949179823508441261575555465843363271711583843;
const real_t KP557913902 =
+0.557913902031834264187699648465567037992437152;
const real_t KP998026728 =
+0.998026728428271561952336806863450553336905220;
const real_t KP249506682 =
+0.249506682107067890488084201715862638334226305;
const real_t KP681693190 =
+0.681693190061530575150324149145440022633095390;
const real_t KP560319534 =
+0.560319534973832390111614715371676131169633784;
const real_t KP906616052 =
+0.906616052148196230441134447086066874408359177;
const real_t KP968479752 =
+0.968479752739016373193524836781420152702090879;
const real_t KP845997307 =
+0.845997307939530944175097360758058292389769300;
const real_t KP470564281 =
+0.470564281212251493087595091036643380879947982;
const real_t KP062914667 =
+0.062914667253649757225485955897349402364686947;
const real_t KP921177326 =
+0.921177326965143320250447435415066029359282231;
const real_t KP833417178 =
+0.833417178328688677408962550243238843138996060;
const real_t KP541454447 =
+0.541454447536312777046285590082819509052033189;
const real_t KP242145790 =
+0.242145790282157779872542093866183953459003101;
const real_t KP968583161 =
+0.968583161128631119490168375464735813836012403;
const real_t KP683113946 =
+0.683113946453479238701949862233725244439656928;
const real_t KP559154169 =
+0.559154169276087864842202529084232643714075927;
const real_t KP904730450 =
+0.904730450839922351881287709692877908104763647;
const real_t KP831864738 =
+0.831864738706457140726048799369896829771167132;
const real_t KP871714437 =
+0.871714437527667770979999223229522602943903653;
const real_t KP939062505 =
+0.939062505817492352556001843133229685779824606;
const real_t KP549754652 =
+0.549754652192770074288023275540779861653779767;
const real_t KP634619297 =
+0.634619297544148100711287640319130485732531031;
const real_t KP256756360 =
+0.256756360367726783319498520922669048172391148;
const real_t KP951056516 =
+0.951056516295153572116439333379382143405698634;
const real_t KP559016994 =
+0.559016994374947424102293417182819058860154590;
const real_t KP250000000 =
+0.250000000000000000000000000000000000000000000;
const real_t KP618033988 =
+0.618033988749894848204586834365638117720309180;
{
real_t T3Y, T3U, T3W, T42, T44, T3X, T3R, T3V, T3Z, T43;
{
real_t T4Q, T1U, T9, T3b, T45, T3e, T46, T1D, T4P, T1R, Ts,
T1K, T18, T1E, T4z;
real_t T5f, T3z, T22, T4s, T5b, T3C, T2o, T3D, T2h, T4p, T5c,
T4w, T5e, T3A, T29;
real_t T2z, T2y, TL, T1L, T1r, T1F, T4a, T57, T3v, T2x, T4k,
T55, T3s, T2T, T2D;
real_t T4c, T3t, T2M, T4h, T54, T1v, T1C, T1Q;
{
real_t T1, T2, T1w, T1x, T3, T5, T1z, T1A, T6;
T1v = v0.y;
T1 = v0.x;
T2 = v5.x;
T1w = v5.y;
T1x = v20.y;
T3 = v20.x;
T5 = v10.x;
T1z = v10.y;
T1A = v15.y;
T6 = v15.x;
{
real_t T3a, T3c, T1y, T39, T1B, T3d;
{
real_t T4, T1S, T7, T1T, T8;
T4 = T2 + T3;
T1S = T2 - T3;
T7 = T5 + T6;
T1T = T5 - T6;
T4Q = fma (-KP618033988, T1S, T1T);
T1U = fma (KP618033988, T1T, T1S);
T8 = T4 + T7;
T3a = T4 - T7;
T3c = T1w - T1x;
T1y = T1w + T1x;
T39 = fma (-KP250000000, T8, T1);
T9 = T1 + T8;
}
T1B = T1z + T1A;
T3d = T1z - T1A;
T3b = fma (KP559016994, T3a, T39);
T45 = fma (-KP559016994, T3a, T39);
T3e = fma (KP618033988, T3d, T3c);
T46 = fma (-KP618033988, T3c, T3d);
T1C = T1y + T1B;
T1Q = T1y - T1B;
}
}
{
real_t T24, T23, T28, T4v;
{
real_t TQ, Ta, TZ, Tj, T1Z, T20, Th, T26, T27, T1X,
TX, T2l, T2m, Tq, T2c;
real_t T2e, T12, T15, T2f, T1P, TT, TW;
TQ = v1.y;
Ta = v1.x;
T1P = fma (-KP250000000, T1C, T1v);
T1D = T1v + T1C;
TZ = v4.y;
Tj = v4.x;
T4P = fma (-KP559016994, T1Q, T1P);
T1R = fma (KP559016994, T1Q, T1P);
{
real_t TR, Tb, Tc, TS, TU, Te, Tf, TV, Td, Tg;
TR = v6.y;
Tb = v6.x;
Tc = v21.x;
TS = v21.y;
TU = v11.y;
Te = v11.x;
Tf = v16.x;
TV = v16.y;
T1Z = Tc - Tb;
Td = Tb + Tc;
T20 = Tf - Te;
Tg = Te + Tf;
Th = Td + Tg;
T24 = Td - Tg;
T26 = TR - TS;
TT = TR + TS;
TW = TU + TV;
T27 = TV - TU;
}
{
real_t T10, Tk, Tl, T11, T13, Tn, To, T14, Tm, Tp;
T10 = v9.y;
Tk = v9.x;
T1X = TT - TW;
TX = TT + TW;
Tl = v24.x;
T11 = v24.y;
T13 = v14.y;
Tn = v14.x;
To = v19.x;
T14 = v19.y;
T2l = Tl - Tk;
Tm = Tk + Tl;
T2m = To - Tn;
Tp = Tn + To;
Tq = Tm + Tp;
T2c = Tm - Tp;
T2e = T11 - T10;
T12 = T10 + T11;
T15 = T13 + T14;
T2f = T14 - T13;
}
{
real_t T2j, T2b, T1W, T21, T4y, T2i;
{
real_t Ti, T16, Tr, TY, T17;
T23 = fma (-KP250000000, Th, Ta);
Ti = Ta + Th;
T2j = T15 - T12;
T16 = T12 + T15;
Tr = Tj + Tq;
T2b = fma (KP250000000, Tq, -(Tj));
T1W = fma (-KP250000000, TX, TQ);
TY = TQ + TX;
T21 = fma (KP618033988, T20, T1Z);
T4y = fma (-KP618033988, T1Z, T20);
T2i = fma (-KP250000000, T16, TZ);
T17 = TZ + T16;
Ts = Ti + Tr;
T1K = Ti - Tr;
T18 = TY - T17;
T1E = TY + T17;
}
{
real_t T2n, T4r, T4x, T1Y;
T2n = fma (KP618033988, T2m, T2l);
T4r = fma (-KP618033988, T2l, T2m);
T4x = fma (-KP559016994, T1X, T1W);
T1Y = fma (KP559016994, T1X, T1W);
{
real_t T4o, T2g, T2d, T4n, T4q, T2k;
T4o = fma (-KP618033988, T2e, T2f);
T2g = fma (KP618033988, T2f, T2e);
T4z = fma (KP951056516, T4y, T4x);
T5f = fma (-KP951056516, T4y, T4x);
T3z = fma (-KP951056516, T21, T1Y);
T22 = fma (KP951056516, T21, T1Y);
T4q = fma (KP559016994, T2j, T2i);
T2k = fma (-KP559016994, T2j, T2i);
T4s = fma (KP951056516, T4r, T4q);
T5b = fma (-KP951056516, T4r, T4q);
T3C = fma (-KP951056516, T2n, T2k);
T2o = fma (KP951056516, T2n, T2k);
T2d = fma (-KP559016994, T2c, T2b);
T4n = fma (KP559016994, T2c, T2b);
T28 = fma (-KP618033988, T27, T26);
T4v = fma (KP618033988, T26, T27);
T3D = fma (-KP951056516, T2g, T2d);
T2h = fma (KP951056516, T2g, T2d);
T4p = fma (KP951056516, T4o, T4n);
T5c = fma (-KP951056516, T4o, T4n);
}
}
}
}
{
real_t T19, Tt, T1i, TC, T2u, T2v, TA, T2B, T2C, T2s,
T1g, T2J, T2K, TJ, T2O;
real_t T2Q, T1l, T1o, T2R;
{
real_t T4u, T25, T1c, T1f;
T19 = v2.y;
Tt = v2.x;
T1i = v3.y;
TC = v3.x;
T4u = fma (-KP559016994, T24, T23);
T25 = fma (KP559016994, T24, T23);
{
real_t T1a, Tu, Tv, T1b, T1d, Tx, Ty, T1e, Tw,
Tz;
T1a = v7.y;
Tu = v7.x;
T4w = fma (-KP951056516, T4v, T4u);
T5e = fma (KP951056516, T4v, T4u);
T3A = fma (-KP951056516, T28, T25);
T29 = fma (KP951056516, T28, T25);
Tv = v22.x;
T1b = v22.y;
T1d = v12.y;
Tx = v12.x;
Ty = v17.x;
T1e = v17.y;
T2u = Tv - Tu;
Tw = Tu + Tv;
T2v = Ty - Tx;
Tz = Tx + Ty;
TA = Tw + Tz;
T2z = Tz - Tw;
T2B = T1b - T1a;
T1c = T1a + T1b;
T1f = T1d + T1e;
T2C = T1d - T1e;
}
{
real_t T1j, TD, TE, T1k, T1m, TG, TH, T1n, TF,
TI;
T1j = v8.y;
TD = v8.x;
T2s = T1f - T1c;
T1g = T1c + T1f;
TE = v23.x;
T1k = v23.y;
T1m = v13.y;
TG = v13.x;
TH = v18.x;
T1n = v18.y;
T2J = TD - TE;
TF = TD + TE;
T2K = TG - TH;
TI = TG + TH;
TJ = TF + TI;
T2O = TI - TF;
T2Q = T1k - T1j;
T1l = T1j + T1k;
T1o = T1m + T1n;
T2R = T1n - T1m;
}
}
{
real_t T2H, T2N, T2r, T2w, T49, T2G;
{
real_t TB, T1p, TK, T1h, T1q;
T2y = fma (-KP250000000, TA, Tt);
TB = Tt + TA;
T2H = T1o - T1l;
T1p = T1l + T1o;
TK = TC + TJ;
T2N = fma (-KP250000000, TJ, TC);
T2r = fma (-KP250000000, T1g, T19);
T1h = T19 + T1g;
T2w = fma (KP618033988, T2v, T2u);
T49 = fma (-KP618033988, T2u, T2v);
T2G = fma (-KP250000000, T1p, T1i);
T1q = T1i + T1p;
TL = TB + TK;
T1L = TB - TK;
T1r = T1h - T1q;
T1F = T1h + T1q;
}
{
real_t T2S, T4j, T48, T2t;
T2S = fma (KP618033988, T2R, T2Q);
T4j = fma (-KP618033988, T2Q, T2R);
T48 = fma (KP559016994, T2s, T2r);
T2t = fma (-KP559016994, T2s, T2r);
{
real_t T4g, T2L, T2I, T4f, T4i, T2P;
T4g = fma (-KP618033988, T2J, T2K);
T2L = fma (KP618033988, T2K, T2J);
T4a = fma (KP951056516, T49, T48);
T57 = fma (-KP951056516, T49, T48);
T3v = fma (-KP951056516, T2w, T2t);
T2x = fma (KP951056516, T2w, T2t);
T4i = fma (KP559016994, T2O, T2N);
T2P = fma (-KP559016994, T2O, T2N);
T4k = fma (-KP951056516, T4j, T4i);
T55 = fma (KP951056516, T4j, T4i);
T3s = fma (KP951056516, T2S, T2P);
T2T = fma (-KP951056516, T2S, T2P);
T2I = fma (-KP559016994, T2H, T2G);
T4f = fma (KP559016994, T2H, T2G);
T2D = fma (-KP618033988, T2C, T2B);
T4c = fma (KP618033988, T2B, T2C);
T3t = fma (KP951056516, T2L, T2I);
T2M = fma (-KP951056516, T2L, T2I);
T4h = fma (-KP951056516, T4g, T4f);
T54 = fma (KP951056516, T4g, T4f);
}
}
}
}
}
{
real_t T4d, T58, T3w, T3H, T3r, T3k, T36, T38, T3o, T3q,
T3j, T2Z, T37;
{
real_t TM, T2E, T1s, T1u, TP, T1t;
{
real_t TO, TN, T4b, T2A;
TM = Ts + TL;
TO = Ts - TL;
T4b = fma (KP559016994, T2z, T2y);
T2A = fma (-KP559016994, T2z, T2y);
TN = fma (-KP250000000, TM, T9);
T4d = fma (KP951056516, T4c, T4b);
T58 = fma (-KP951056516, T4c, T4b);
T3w = fma (KP951056516, T2D, T2A);
T2E = fma (-KP951056516, T2D, T2A);
T1s = fma (KP618033988, T1r, T18);
T1u = fma (-KP618033988, T18, T1r);
TP = fma (KP559016994, TO, TN);
T1t = fma (-KP559016994, TO, TN);
}
{
real_t T1J, T1N, T1M, T1O, T1G, T1I, T1H;
T1G = T1E + T1F;
T1I = T1E - T1F;
T1H = fma (-KP250000000, T1G, T1D);
v0.y = T1D + T1G;
v0.x = T9 + TM;
T1J = fma (KP559016994, T1I, T1H);
T1N = fma (-KP559016994, T1I, T1H);
T1M = fma (KP618033988, T1L, T1K);
T1O = fma (-KP618033988, T1K, T1L);
{
real_t T1V, T3f, T3m, T3n, T2W, T2Y, T32, T3g,
T3h, T35, T3i, T2X;
T3H = fma (KP951056516, T1U, T1R);
T1V = fma (-KP951056516, T1U, T1R);
T3f = fma (KP951056516, T3e, T3b);
T3r = fma (-KP951056516, T3e, T3b);
v15.x = fma (KP951056516, T1u, T1t);
v15.y = fma (-KP951056516, T1O, T1N);
v10.y = fma (KP951056516, T1O, T1N);
v10.x = fma (-KP951056516, T1u, T1t);
v20.x = fma (-KP951056516, T1s, TP);
v20.y = fma (KP951056516, T1M, T1J);
v5.y = fma (-KP951056516, T1M, T1J);
v5.x = fma (KP951056516, T1s, TP);
{
real_t T30, T2a, T2p, T31, T33, T2F, T2U,
T34, T2q, T2V;
T30 = fma (KP256756360, T22, T29);
T2a = fma (-KP256756360, T29, T22);
T2p = fma (KP634619297, T2o, T2h);
T31 = fma (-KP634619297, T2h, T2o);
T33 = fma (KP549754652, T2x, T2E);
T2F = fma (-KP549754652, T2E, T2x);
T2U = fma (-KP939062505, T2T, T2M);
T34 = fma (KP939062505, T2M, T2T);
T3m = fma (-KP871714437, T2p, T2a);
T2q = fma (KP871714437, T2p, T2a);
T3n = fma (-KP831864738, T2U, T2F);
T2V = fma (KP831864738, T2U, T2F);
T2W = fma (KP904730450, T2V, T2q);
T2Y = fma (-KP904730450, T2V, T2q);
T32 = fma (-KP871714437, T31, T30);
T3g = fma (KP871714437, T31, T30);
T3h = fma (KP831864738, T34, T33);
T35 = fma (-KP831864738, T34, T33);
}
T3i = fma (KP904730450, T3h, T3g);
T3k = fma (-KP904730450, T3h, T3g);
T36 = fma (KP559154169, T35, T32);
T38 = fma (-KP683113946, T32, T35);
v1.x = fma (KP968583161, T3i, T3f);
v1.y = fma (KP968583161, T2W, T1V);
T2X = fma (-KP242145790, T2W, T1V);
T3o = fma (KP559154169, T3n, T3m);
T3q = fma (-KP683113946, T3m, T3n);
T3j = fma (-KP242145790, T3i, T3f);
T2Z = fma (KP541454447, T2Y, T2X);
T37 = fma (-KP541454447, T2Y, T2X);
}
}
}
{
real_t T47, T4R, T5A, T5w, T5y, T5E, T5G, T5z, T5t,
T5x;
{
real_t T53, T5j, T5u, T5v, T5i, T5D, T5m, T5p,
T5C, T3p, T3l, T5s, T5q, T5r;
T47 = fma (KP951056516, T46, T45);
T53 = fma (-KP951056516, T46, T45);
T3p = fma (-KP541454447, T3k, T3j);
T3l = fma (KP541454447, T3k, T3j);
v11.y = fma (KP833417178, T38, T37);
v11.x = fma (-KP833417178, T3q, T3p);
v16.x = fma (KP833417178, T3q, T3p);
v16.y = fma (-KP833417178, T38, T37);
v21.y = fma (KP921177326, T36, T2Z);
v21.x = fma (-KP921177326, T3o, T3l);
v6.x = fma (KP921177326, T3o, T3l);
v6.y = fma (-KP921177326, T36, T2Z);
T5j = fma (KP951056516, T4Q, T4P);
T4R = fma (-KP951056516, T4Q, T4P);
{
real_t T5k, T56, T59, T5l, T5n, T5d, T5g, T5o,
T5a, T5h;
T5k = fma (-KP062914667, T54, T55);
T56 = fma (KP062914667, T55, T54);
T59 = fma (KP634619297, T58, T57);
T5l = fma (-KP634619297, T57, T58);
T5n = fma (-KP470564281, T5b, T5c);
T5d = fma (KP470564281, T5c, T5b);
T5g = fma (KP549754652, T5f, T5e);
T5o = fma (-KP549754652, T5e, T5f);
T5u = fma (-KP845997307, T59, T56);
T5a = fma (KP845997307, T59, T56);
T5v = fma (-KP968479752, T5g, T5d);
T5h = fma (KP968479752, T5g, T5d);
T5i = fma (KP906616052, T5h, T5a);
T5A = fma (-KP906616052, T5h, T5a);
T5D = fma (-KP845997307, T5l, T5k);
T5m = fma (KP845997307, T5l, T5k);
T5p = fma (KP968479752, T5o, T5n);
T5C = fma (-KP968479752, T5o, T5n);
}
T5s = fma (KP906616052, T5p, T5m);
T5q = fma (-KP906616052, T5p, T5m);
T5w = fma (-KP560319534, T5v, T5u);
T5y = fma (KP681693190, T5u, T5v);
T5E = fma (-KP681693190, T5D, T5C);
T5G = fma (KP560319534, T5C, T5D);
T5r = fma (KP249506682, T5q, T5j);
v2.x = fma (KP998026728, T5i, T53);
v2.y = fma (-KP998026728, T5q, T5j);
T5z = fma (-KP249506682, T5i, T53);
T5t = fma (-KP557913902, T5s, T5r);
T5x = fma (KP557913902, T5s, T5r);
}
{
real_t T4W, T4M, T4O, T50, T52, T4V, T4F, T4N;
{
real_t T4Y, T4Z, T4C, T4E, T4I, T4T, T4S, T4L,
T5F, T5B, T4U, T4D;
T5F = fma (KP557913902, T5A, T5z);
T5B = fma (-KP557913902, T5A, T5z);
v12.y = fma (-KP949179823, T5w, T5t);
v12.x = fma (-KP949179823, T5G, T5F);
v17.x = fma (KP949179823, T5G, T5F);
v17.y = fma (KP949179823, T5w, T5t);
v7.y = fma (KP860541664, T5y, T5x);
v7.x = fma (-KP860541664, T5E, T5B);
v22.x = fma (KP860541664, T5E, T5B);
v22.y = fma (-KP860541664, T5y, T5x);
{
real_t T4J, T4e, T4l, T4K, T4G, T4t, T4A,
T4H, T4m, T4B;
T4J = fma (-KP062914667, T4a, T4d);
T4e = fma (KP062914667, T4d, T4a);
T4l = fma (-KP827271945, T4k, T4h);
T4K = fma (KP827271945, T4h, T4k);
T4G = fma (-KP126329378, T4p, T4s);
T4t = fma (KP126329378, T4s, T4p);
T4A = fma (KP939062505, T4z, T4w);
T4H = fma (-KP939062505, T4w, T4z);
T4Y = fma (-KP772036680, T4l, T4e);
T4m = fma (KP772036680, T4l, T4e);
T4Z = fma (-KP734762448, T4A, T4t);
T4B = fma (KP734762448, T4A, T4t);
T4C = fma (KP994076283, T4B, T4m);
T4E = fma (-KP994076283, T4B, T4m);
T4I = fma (KP734762448, T4H, T4G);
T4T = fma (-KP734762448, T4H, T4G);
T4S = fma (KP772036680, T4K, T4J);
T4L = fma (-KP772036680, T4K, T4J);
}
T4U = fma (KP994076283, T4T, T4S);
T4W = fma (-KP994076283, T4T, T4S);
T4M = fma (-KP621716863, T4L, T4I);
T4O = fma (KP614372930, T4I, T4L);
v3.y = fma (-KP998026728, T4U, T4R);
v3.x = fma (KP998026728, T4C, T47);
T4D = fma (-KP249506682, T4C, T47);
T50 = fma (KP614372930, T4Z, T4Y);
T52 = fma (-KP621716863, T4Y, T4Z);
T4V = fma (KP249506682, T4U, T4R);
T4F = fma (-KP557913902, T4E, T4D);
T4N = fma (KP557913902, T4E, T4D);
}
{
real_t T3S, T3T, T3G, T41, T3K, T3N, T40, T51,
T4X, T3Q, T3O, T3P;
T51 = fma (KP557913902, T4W, T4V);
T4X = fma (-KP557913902, T4W, T4V);
v8.x = fma (KP943557151, T4M, T4F);
v8.y = fma (KP943557151, T52, T51);
v23.y = fma (-KP943557151, T52, T51);
v23.x = fma (-KP943557151, T4M, T4F);
v18.x = fma (-KP949179823, T4O, T4N);
v18.y = fma (-KP949179823, T50, T4X);
v13.y = fma (KP949179823, T50, T4X);
v13.x = fma (KP949179823, T4O, T4N);
{
real_t T3I, T3u, T3x, T3J, T3L, T3B, T3E,
T3M, T3y, T3F;
T3I = fma (KP126329378, T3s, T3t);
T3u = fma (-KP126329378, T3t, T3s);
T3x = fma (-KP470564281, T3w, T3v);
T3J = fma (KP470564281, T3v, T3w);
T3L = fma (-KP634619297, T3z, T3A);
T3B = fma (KP634619297, T3A, T3z);
T3E = fma (-KP827271945, T3D, T3C);
T3M = fma (KP827271945, T3C, T3D);
T3S = fma (KP912018591, T3x, T3u);
T3y = fma (-KP912018591, T3x, T3u);
T3T = fma (KP912575812, T3E, T3B);
T3F = fma (-KP912575812, T3E, T3B);
T3G = fma (-KP851038619, T3F, T3y);
T3Y = fma (KP851038619, T3F, T3y);
T41 = fma (-KP912018591, T3J, T3I);
T3K = fma (KP912018591, T3J, T3I);
T3N = fma (KP912575812, T3M, T3L);
T40 = fma (-KP912575812, T3M, T3L);
}
T3Q = fma (-KP851038619, T3N, T3K);
T3O = fma (KP851038619, T3N, T3K);
T3U = fma (-KP525970792, T3T, T3S);
T3W = fma (KP726211448, T3S, T3T);
T42 = fma (-KP726211448, T41, T40);
T44 = fma (KP525970792, T40, T41);
T3P = fma (KP248028675, T3O, T3H);
v4.x = fma (-KP992114701, T3G, T3r);
v4.y = fma (-KP992114701, T3O, T3H);
T3X = fma (KP248028675, T3G, T3r);
T3R = fma (-KP554608978, T3Q, T3P);
T3V = fma (KP554608978, T3Q, T3P);
}
}
}
}
}
T3Z = fma (KP554608978, T3Y, T3X);
T43 = fma (-KP554608978, T3Y, T3X);
v14.y = fma (KP943557151, T3U, T3R);
v14.x = fma (-KP943557151, T44, T43);
v19.x = fma (KP943557151, T44, T43);
v19.y = fma (-KP943557151, T3U, T3R);
v24.y = fma (KP803003575, T3W, T3V);
v24.x = fma (KP803003575, T42, T3Z);
v9.x = fma (-KP803003575, T42, T3Z);
v9.y = fma (-KP803003575, T3W, T3V);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
*u13 = v13;
*u14 = v14;
*u15 = v15;
*u16 = v16;
*u17 = v17;
*u18 = v18;
*u19 = v19;
*u20 = v20;
*u21 = v21;
*u22 = v22;
*u23 = v23;
*u24 = v24;
}
/*
 * One radix-25 pass over batched double2 data.
 *
 *   x        input buffer; each batch occupies threads * 25 elements
 *   y        output buffer with the same per-batch size
 *   p        stride used for the output writes and for the twiddle index
 *   threads  number of active work-items along dimension 0
 *
 * Work-item `gid` (dimension 0) of batch get_global_id(1) reads 25 values
 * spaced `threads` apart, multiplies values 1..24 by
 * twiddle(c_n * (gid % p) / p) via mul() unless p == 1, runs dft25 on them
 * in place and stores the results spaced `p` apart.  The literals c_n are
 * approximately -2*pi*n/25.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t tw_idx = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 25;

    /* Gather: element n of this butterfly sits n * threads past src. */
    global const double2 *src = x + (gid + batch_base);
    double2 v0  = src[0 * threads];
    double2 v1  = src[1 * threads];
    double2 v2  = src[2 * threads];
    double2 v3  = src[3 * threads];
    double2 v4  = src[4 * threads];
    double2 v5  = src[5 * threads];
    double2 v6  = src[6 * threads];
    double2 v7  = src[7 * threads];
    double2 v8  = src[8 * threads];
    double2 v9  = src[9 * threads];
    double2 v10 = src[10 * threads];
    double2 v11 = src[11 * threads];
    double2 v12 = src[12 * threads];
    double2 v13 = src[13 * threads];
    double2 v14 = src[14 * threads];
    double2 v15 = src[15 * threads];
    double2 v16 = src[16 * threads];
    double2 v17 = src[17 * threads];
    double2 v18 = src[18 * threads];
    double2 v19 = src[19 * threads];
    double2 v20 = src[20 * threads];
    double2 v21 = src[21 * threads];
    double2 v22 = src[22 * threads];
    double2 v23 = src[23 * threads];
    double2 v24 = src[24 * threads];

    /* Twiddle step; skipped entirely when p == 1. */
    if (p != 1) {
        v1  = mul(v1,  twiddle((double)-0.2513274122871835 * tw_idx / p));
        v2  = mul(v2,  twiddle((double)-0.5026548245743669 * tw_idx / p));
        v3  = mul(v3,  twiddle((double)-0.7539822368615503 * tw_idx / p));
        v4  = mul(v4,  twiddle((double)-1.005309649148734 * tw_idx / p));
        v5  = mul(v5,  twiddle((double)-1.256637061435917 * tw_idx / p));
        v6  = mul(v6,  twiddle((double)-1.507964473723101 * tw_idx / p));
        v7  = mul(v7,  twiddle((double)-1.759291886010284 * tw_idx / p));
        v8  = mul(v8,  twiddle((double)-2.010619298297468 * tw_idx / p));
        v9  = mul(v9,  twiddle((double)-2.261946710584651 * tw_idx / p));
        v10 = mul(v10, twiddle((double)-2.513274122871834 * tw_idx / p));
        v11 = mul(v11, twiddle((double)-2.764601535159018 * tw_idx / p));
        v12 = mul(v12, twiddle((double)-3.015928947446201 * tw_idx / p));
        v13 = mul(v13, twiddle((double)-3.267256359733385 * tw_idx / p));
        v14 = mul(v14, twiddle((double)-3.518583772020568 * tw_idx / p));
        v15 = mul(v15, twiddle((double)-3.769911184307751 * tw_idx / p));
        v16 = mul(v16, twiddle((double)-4.021238596594936 * tw_idx / p));
        v17 = mul(v17, twiddle((double)-4.272566008882119 * tw_idx / p));
        v18 = mul(v18, twiddle((double)-4.523893421169302 * tw_idx / p));
        v19 = mul(v19, twiddle((double)-4.775220833456485 * tw_idx / p));
        v20 = mul(v20, twiddle((double)-5.026548245743669 * tw_idx / p));
        v21 = mul(v21, twiddle((double)-5.277875658030853 * tw_idx / p));
        v22 = mul(v22, twiddle((double)-5.529203070318036 * tw_idx / p));
        v23 = mul(v23, twiddle((double)-5.780530482605219 * tw_idx / p));
        v24 = mul(v24, twiddle((double)-6.031857894892402 * tw_idx / p));
    }

    /* 25-point transform, performed in place on the local copies. */
    dft25(&v0,  &v1,  &v2,  &v3,  &v4,
          &v5,  &v6,  &v7,  &v8,  &v9,
          &v10, &v11, &v12, &v13, &v14,
          &v15, &v16, &v17, &v18, &v19,
          &v20, &v21, &v22, &v23, &v24);

    /* Scatter: result n goes n * p past dst. */
    const size_t out_idx = tw_idx + (gid - tw_idx) * 25;
    global double2 *dst = y + (out_idx + batch_base);
    dst[0 * p]  = v0;
    dst[1 * p]  = v1;
    dst[2 * p]  = v2;
    dst[3 * p]  = v3;
    dst[4 * p]  = v4;
    dst[5 * p]  = v5;
    dst[6 * p]  = v6;
    dst[7 * p]  = v7;
    dst[8 * p]  = v8;
    dst[9 * p]  = v9;
    dst[10 * p] = v10;
    dst[11 * p] = v11;
    dst[12 * p] = v12;
    dst[13 * p] = v13;
    dst[14 * p] = v14;
    dst[15 * p] = v15;
    dst[16 * p] = v16;
    dst[17 * p] = v17;
    dst[18 * p] = v18;
    dst[19 * p] = v19;
    dst[20 * p] = v20;
    dst[21 * p] = v21;
    dst[22 * p] = v22;
    dst[23 * p] = v23;
    dst[24 * p] = v24;
}
/* DEVICE expands to nothing; it is written in front of the generated
 * function definitions that follow (e.g. "DEVICE void dft4 (...)"). */
#define DEVICE
/* Enable a double-precision extension when the compiler defines the
 * matching macro: cl_khr_fp64 is checked first, then cl_amd_fp64.  If
 * neither macro is defined, no pragma is emitted. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component types used by the generated DFT routine below. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two double2 values component-wise as
 *   result.x = a.x * b.x + a.y * b.y
 *   result.y = a.y * b.x - a.x * b.y
 * Reading .x as the real part and .y as the imaginary part, this is `a`
 * multiplied by the complex conjugate of `b`.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/*
 * Build the double2 {cos(alpha), sin(alpha)} for an angle in radians.
 * OpenCL's sincos() returns the sine and writes the cosine through its
 * pointer argument, so one call yields both components.
 */
double2 twiddle
(
    double alpha
)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 16 stack variables, 0 constants, and 16 memory accesses
*/
/*
 * In-place 4-point DFT on the values behind u0..u3 (.x real, .y imaginary).
 *
 * With inputs a0..a3 the outputs are
 *   *u0 = a0 +   a1 + a2 +   a3
 *   *u1 = a0 + i*a1 - a2 - i*a3
 *   *u2 = a0 -   a1 + a2 -   a3
 *   *u3 = a0 - i*a1 - a2 + i*a3
 * i.e. output k is sum_n a_n * i^(n*k).  No scaling is applied.
 * All four inputs are read before any output is stored.
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
  const real2_t in0 = *u0;
  const real2_t in1 = *u1;
  const real2_t in2 = *u2;
  const real2_t in3 = *u3;

  /* Sums and differences of the even-indexed pair (0, 2). */
  const real_t even_sum_re = in0.x + in2.x;
  const real_t even_dif_re = in0.x - in2.x;
  const real_t even_sum_im = in0.y + in2.y;
  const real_t even_dif_im = in0.y - in2.y;

  /* Sums and differences of the odd-indexed pair (1, 3). */
  const real_t odd_sum_re = in1.x + in3.x;
  const real_t odd_dif_re = in1.x - in3.x;
  const real_t odd_sum_im = in1.y + in3.y;
  const real_t odd_dif_im = in1.y - in3.y;

  real2_t out0;
  real2_t out1;
  real2_t out2;
  real2_t out3;

  out0.x = even_sum_re + odd_sum_re;
  out0.y = even_sum_im + odd_sum_im;

  /* Multiplying the odd difference by +i swaps its parts and negates one. */
  out1.x = even_dif_re - odd_dif_im;
  out1.y = odd_dif_re + even_dif_im;

  out2.x = even_sum_re - odd_sum_re;
  out2.y = even_sum_im - odd_sum_im;

  out3.x = even_dif_re + odd_dif_im;
  out3.y = even_dif_im - odd_dif_re;

  *u0 = out0;
  *u1 = out1;
  *u2 = out2;
  *u3 = out3;
}
/*
 * One radix-4 pass over batched double2 data.
 *
 *   x        input buffer; each batch occupies threads * 4 elements
 *   y        output buffer with the same per-batch size
 *   p        stride used for the output writes and for the twiddle index
 *   threads  number of active work-items along dimension 0
 *
 * Work-item `gid` (dimension 0) of batch get_global_id(1) reads 4 values
 * spaced `threads` apart, multiplies values 1..3 by
 * twiddle(c_n * (gid % p) / p) via mul() unless p == 1, runs dft4 on them
 * in place and stores the results spaced `p` apart.  The literals c_n are
 * approximately -pi/2, -pi and -3*pi/2.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t gid = get_global_id(0);
    if (gid >= threads)
        return;

    const size_t tw_idx = gid % p;
    const size_t batch_base = get_global_id(1) * threads * 4;

    /* Gather: element n of this butterfly sits n * threads past src. */
    global const double2 *src = x + (gid + batch_base);
    double2 v0 = src[0 * threads];
    double2 v1 = src[1 * threads];
    double2 v2 = src[2 * threads];
    double2 v3 = src[3 * threads];

    /* Twiddle step; skipped entirely when p == 1. */
    if (p != 1) {
        v1 = mul(v1, twiddle((double)-1.570796326794897 * tw_idx / p));
        v2 = mul(v2, twiddle((double)-3.141592653589793 * tw_idx / p));
        v3 = mul(v3, twiddle((double)-4.71238898038469 * tw_idx / p));
    }

    /* 4-point transform, performed in place on the local copies. */
    dft4(&v0, &v1, &v2, &v3);

    /* Scatter: result n goes n * p past dst. */
    const size_t out_idx = tw_idx + (gid - tw_idx) * 4;
    global double2 *dst = y + (out_idx + batch_base);
    dst[0 * p] = v0;
    dst[1 * p] = v1;
    dst[2 * p] = v2;
    dst[3 * p] = v3;
}
/* DEVICE expands to nothing; it is written in front of the generated
 * function definitions that follow (e.g. "DEVICE void dft25 (...)"). */
#define DEVICE
/* Enable a double-precision extension when the compiler defines the
 * matching macro: cl_khr_fp64 is checked first, then cl_amd_fp64.  If
 * neither macro is defined, no pragma is emitted. */
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
/* Scalar and two-component types used by the generated DFT routine below. */
typedef double real_t;
typedef double2 real2_t;
/*
 * Combine two double2 values component-wise as
 *   result.x = a.x * b.x + a.y * b.y
 *   result.y = a.y * b.x - a.x * b.y
 * Reading .x as the real part and .y as the imaginary part, this is `a`
 * multiplied by the complex conjugate of `b`.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/*
 * Build the double2 {cos(alpha), sin(alpha)} for an angle in radians.
 * OpenCL's sincos() returns the sine and writes the cosine through its
 * pointer argument, so one call yields both components.
 */
double2 twiddle
(
    double alpha
)
{
    double cos_part;
    const double sin_part = sincos(alpha, &cos_part);
    double2 w = {cos_part, sin_part};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 25 -name dft25 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 352 FP additions, 268 FP multiplications,
* (or, 84 additions, 0 multiplications, 268 fused multiply/add),
* 188 stack variables, 47 constants, and 100 memory accesses
*/
DEVICE void
dft25 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13,
real2_t * u14, real2_t * u15, real2_t * u16, real2_t * u17,
real2_t * u18, real2_t * u19, real2_t * u20, real2_t * u21,
real2_t * u22, real2_t * u23, real2_t * u24)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
real2_t v9 = *u9;
real2_t v10 = *u10;
real2_t v11 = *u11;
real2_t v12 = *u12;
real2_t v13 = *u13;
real2_t v14 = *u14;
real2_t v15 = *u15;
real2_t v16 = *u16;
real2_t v17 = *u17;
real2_t v18 = *u18;
real2_t v19 = *u19;
real2_t v20 = *u20;
real2_t v21 = *u21;
real2_t v22 = *u22;
real2_t v23 = *u23;
real2_t v24 = *u24;
{
const real_t KP803003575 =
+0.803003575438660414833440593570376004635464850;
const real_t KP554608978 =
+0.554608978404018097464974850792216217022558774;
const real_t KP992114701 =
+0.992114701314477831049793042785778521453036709;
const real_t KP248028675 =
+0.248028675328619457762448260696444630363259177;
const real_t KP726211448 =
+0.726211448929902658173535992263577167607493062;
const real_t KP525970792 =
+0.525970792408939708442463226536226366643874659;
const real_t KP851038619 =
+0.851038619207379630836264138867114231259902550;
const real_t KP912575812 =
+0.912575812670962425556968549836277086778922727;
const real_t KP912018591 =
+0.912018591466481957908415381764119056233607330;
const real_t KP860541664 =
+0.860541664367944677098261680920518816412804187;
const real_t KP560319534 =
+0.560319534973832390111614715371676131169633784;
const real_t KP681693190 =
+0.681693190061530575150324149145440022633095390;
const real_t KP906616052 =
+0.906616052148196230441134447086066874408359177;
const real_t KP968479752 =
+0.968479752739016373193524836781420152702090879;
const real_t KP845997307 =
+0.845997307939530944175097360758058292389769300;
const real_t KP470564281 =
+0.470564281212251493087595091036643380879947982;
const real_t KP949179823 =
+0.949179823508441261575555465843363271711583843;
const real_t KP943557151 =
+0.943557151597354104399655195398983005179443399;
const real_t KP557913902 =
+0.557913902031834264187699648465567037992437152;
const real_t KP249506682 =
+0.249506682107067890488084201715862638334226305;
const real_t KP998026728 =
+0.998026728428271561952336806863450553336905220;
const real_t KP614372930 =
+0.614372930789563808870829930444362096004872855;
const real_t KP621716863 =
+0.621716863012209892444754556304102309693593202;
const real_t KP994076283 =
+0.994076283785401014123185814696322018529298887;
const real_t KP734762448 =
+0.734762448793050413546343770063151342619912334;
const real_t KP772036680 =
+0.772036680810363904029489473607579825330539880;
const real_t KP126329378 =
+0.126329378446108174786050455341811215027378105;
const real_t KP827271945 =
+0.827271945972475634034355757144307982555673741;
const real_t KP062914667 =
+0.062914667253649757225485955897349402364686947;
const real_t KP921177326 =
+0.921177326965143320250447435415066029359282231;
const real_t KP833417178 =
+0.833417178328688677408962550243238843138996060;
const real_t KP541454447 =
+0.541454447536312777046285590082819509052033189;
const real_t KP242145790 =
+0.242145790282157779872542093866183953459003101;
const real_t KP968583161 =
+0.968583161128631119490168375464735813836012403;
const real_t KP683113946 =
+0.683113946453479238701949862233725244439656928;
const real_t KP559154169 =
+0.559154169276087864842202529084232643714075927;
const real_t KP904730450 =
+0.904730450839922351881287709692877908104763647;
const real_t KP831864738 =
+0.831864738706457140726048799369896829771167132;
const real_t KP871714437 =
+0.871714437527667770979999223229522602943903653;
const real_t KP939062505 =
+0.939062505817492352556001843133229685779824606;
const real_t KP549754652 =
+0.549754652192770074288023275540779861653779767;
const real_t KP634619297 =
+0.634619297544148100711287640319130485732531031;
const real_t KP256756360 =
+0.256756360367726783319498520922669048172391148;
const real_t KP951056516 =
+0.951056516295153572116439333379382143405698634;
const real_t KP559016994 =
+0.559016994374947424102293417182819058860154590;
const real_t KP250000000 =
+0.250000000000000000000000000000000000000000000;
const real_t KP618033988 =
+0.618033988749894848204586834365638117720309180;
{
real_t T3Y, T3U, T3W, T42, T44, T3X, T3R, T3V, T3Z, T43;
{
real_t T4E, T3e, T9, T1R, T45, T1U, T46, T1D, T4D, T3b, Ts,
T1K, T18, T1E, T4w;
real_t T5f, T3A, T22, T4s, T5c, T3D, T2o, T3C, T2h, T4p, T5b,
T4z, T5e, T3z, T29;
real_t T2z, T2y, TL, T1L, T1r, T1F, T4k, T54, T3w, T2x, T4a,
T58, T3t, T2T, T2D;
real_t T4g, T3s, T2M, T4d, T57, T1v, T1C, T3a;
{
real_t T1, T2, T1w, T1x, T3, T5, T1z, T1A, T6;
T1v = v0.y;
T1 = v0.x;
T2 = v5.x;
T1w = v5.y;
T1x = v20.y;
T3 = v20.x;
T5 = v10.x;
T1z = v10.y;
T1A = v15.y;
T6 = v15.x;
{
real_t T1Q, T1S, T1y, T1P, T1B, T1T;
{
real_t T4, T3c, T7, T3d, T8;
T4 = T2 + T3;
T3c = T2 - T3;
T7 = T5 + T6;
T3d = T5 - T6;
T4E = fma (-KP618033988, T3c, T3d);
T3e = fma (KP618033988, T3d, T3c);
T8 = T4 + T7;
T1Q = T4 - T7;
T1S = T1w - T1x;
T1y = T1w + T1x;
T1P = fma (-KP250000000, T8, T1);
T9 = T1 + T8;
}
T1B = T1z + T1A;
T1T = T1z - T1A;
T1R = fma (KP559016994, T1Q, T1P);
T45 = fma (-KP559016994, T1Q, T1P);
T1U = fma (KP618033988, T1T, T1S);
T46 = fma (-KP618033988, T1S, T1T);
T1C = T1y + T1B;
T3a = T1y - T1B;
}
}
{
real_t T24, T23, T28, T4y;
{
real_t TQ, Ta, TZ, Tj, T26, T27, Th, T1X, T1Z, T20,
TX, T2e, T2f, Tq, T2j;
real_t T2l, T12, T15, T2m, T39, TT, TW;
TQ = v1.y;
Ta = v1.x;
T39 = fma (-KP250000000, T1C, T1v);
T1D = T1v + T1C;
TZ = v4.y;
Tj = v4.x;
T4D = fma (-KP559016994, T3a, T39);
T3b = fma (KP559016994, T3a, T39);
{
real_t TR, Tb, Tc, TS, TU, Te, Tf, TV, Td, Tg;
TR = v6.y;
Tb = v6.x;
Tc = v21.x;
TS = v21.y;
TU = v11.y;
Te = v11.x;
Tf = v16.x;
TV = v16.y;
T26 = Tb - Tc;
Td = Tb + Tc;
T27 = Tf - Te;
Tg = Te + Tf;
Th = Td + Tg;
T1X = Td - Tg;
T1Z = TS - TR;
TT = TR + TS;
TW = TU + TV;
T20 = TV - TU;
}
{
real_t T10, Tk, Tl, T11, T13, Tn, To, T14, Tm, Tp;
T10 = v9.y;
Tk = v9.x;
T24 = TT - TW;
TX = TT + TW;
Tl = v24.x;
T11 = v24.y;
T13 = v14.y;
Tn = v14.x;
To = v19.x;
T14 = v19.y;
T2e = Tl - Tk;
Tm = Tk + Tl;
T2f = To - Tn;
Tp = Tn + To;
Tq = Tm + Tp;
T2j = Tp - Tm;
T2l = T11 - T10;
T12 = T10 + T11;
T15 = T13 + T14;
T2m = T14 - T13;
}
{
real_t T1W, T2c, T2i, T21, T4v, T2b;
{
real_t Ti, T16, Tr, TY, T17;
T1W = fma (-KP250000000, Th, Ta);
Ti = Ta + Th;
T2c = T12 - T15;
T16 = T12 + T15;
Tr = Tj + Tq;
T2i = fma (-KP250000000, Tq, Tj);
T23 = fma (-KP250000000, TX, TQ);
TY = TQ + TX;
T21 = fma (KP618033988, T20, T1Z);
T4v = fma (-KP618033988, T1Z, T20);
T2b = fma (KP250000000, T16, -(TZ));
T17 = TZ + T16;
Ts = Ti + Tr;
T1K = Ti - Tr;
T18 = TY - T17;
T1E = TY + T17;
}
{
real_t T2n, T4r, T4u, T1Y;
T2n = fma (KP618033988, T2m, T2l);
T4r = fma (-KP618033988, T2l, T2m);
T4u = fma (-KP559016994, T1X, T1W);
T1Y = fma (KP559016994, T1X, T1W);
{
real_t T4o, T2g, T2d, T4n, T4q, T2k;
T4o = fma (-KP618033988, T2e, T2f);
T2g = fma (KP618033988, T2f, T2e);
T4w = fma (-KP951056516, T4v, T4u);
T5f = fma (KP951056516, T4v, T4u);
T3A = fma (-KP951056516, T21, T1Y);
T22 = fma (KP951056516, T21, T1Y);
T4q = fma (KP559016994, T2j, T2i);
T2k = fma (-KP559016994, T2j, T2i);
T4s = fma (-KP951056516, T4r, T4q);
T5c = fma (KP951056516, T4r, T4q);
T3D = fma (-KP951056516, T2n, T2k);
T2o = fma (KP951056516, T2n, T2k);
T2d = fma (-KP559016994, T2c, T2b);
T4n = fma (KP559016994, T2c, T2b);
T28 = fma (-KP618033988, T27, T26);
T4y = fma (KP618033988, T26, T27);
T3C = fma (-KP951056516, T2g, T2d);
T2h = fma (KP951056516, T2g, T2d);
T4p = fma (-KP951056516, T4o, T4n);
T5b = fma (KP951056516, T4o, T4n);
}
}
}
}
{
real_t T19, Tt, T1i, TC, T2B, T2C, TA, T2s, T2u, T2v,
T1g, T2Q, T2R, TJ, T2H;
real_t T2J, T1l, T1o, T2K;
{
real_t T4x, T25, T1c, T1f;
T19 = v2.y;
Tt = v2.x;
T1i = v3.y;
TC = v3.x;
T4x = fma (-KP559016994, T24, T23);
T25 = fma (KP559016994, T24, T23);
{
real_t T1a, Tu, Tv, T1b, T1d, Tx, Ty, T1e, Tw,
Tz;
T1a = v7.y;
Tu = v7.x;
T4z = fma (KP951056516, T4y, T4x);
T5e = fma (-KP951056516, T4y, T4x);
T3z = fma (-KP951056516, T28, T25);
T29 = fma (KP951056516, T28, T25);
Tv = v22.x;
T1b = v22.y;
T1d = v12.y;
Tx = v12.x;
Ty = v17.x;
T1e = v17.y;
T2B = Tv - Tu;
Tw = Tu + Tv;
T2C = Tx - Ty;
Tz = Tx + Ty;
TA = Tw + Tz;
T2s = Tz - Tw;
T2u = T1b - T1a;
T1c = T1a + T1b;
T1f = T1d + T1e;
T2v = T1e - T1d;
}
{
real_t T1j, TD, TE, T1k, T1m, TG, TH, T1n, TF,
TI;
T1j = v8.y;
TD = v8.x;
T2z = T1f - T1c;
T1g = T1c + T1f;
TE = v23.x;
T1k = v23.y;
T1m = v13.y;
TG = v13.x;
TH = v18.x;
T1n = v18.y;
T2Q = TE - TD;
TF = TD + TE;
T2R = TH - TG;
TI = TG + TH;
TJ = TF + TI;
T2H = TI - TF;
T2J = T1j - T1k;
T1l = T1j + T1k;
T1o = T1m + T1n;
T2K = T1m - T1n;
}
}
{
real_t T2r, T2O, T2G, T2w, T4j, T2N;
{
real_t TB, T1p, TK, T1h, T1q;
T2r = fma (-KP250000000, TA, Tt);
TB = Tt + TA;
T2O = T1o - T1l;
T1p = T1l + T1o;
TK = TC + TJ;
T2G = fma (-KP250000000, TJ, TC);
T2y = fma (-KP250000000, T1g, T19);
T1h = T19 + T1g;
T2w = fma (KP618033988, T2v, T2u);
T4j = fma (-KP618033988, T2u, T2v);
T2N = fma (-KP250000000, T1p, T1i);
T1q = T1i + T1p;
TL = TB + TK;
T1L = TB - TK;
T1r = T1h - T1q;
T1F = T1h + T1q;
}
{
real_t T2S, T49, T4i, T2t;
T2S = fma (KP618033988, T2R, T2Q);
T49 = fma (-KP618033988, T2Q, T2R);
T4i = fma (KP559016994, T2s, T2r);
T2t = fma (-KP559016994, T2s, T2r);
{
real_t T4c, T2L, T2I, T4b, T48, T2P;
T4c = fma (-KP618033988, T2J, T2K);
T2L = fma (KP618033988, T2K, T2J);
T4k = fma (-KP951056516, T4j, T4i);
T54 = fma (KP951056516, T4j, T4i);
T3w = fma (-KP951056516, T2w, T2t);
T2x = fma (KP951056516, T2w, T2t);
T48 = fma (KP559016994, T2O, T2N);
T2P = fma (-KP559016994, T2O, T2N);
T4a = fma (KP951056516, T49, T48);
T58 = fma (-KP951056516, T49, T48);
T3t = fma (KP951056516, T2S, T2P);
T2T = fma (-KP951056516, T2S, T2P);
T2I = fma (-KP559016994, T2H, T2G);
T4b = fma (KP559016994, T2H, T2G);
T2D = fma (-KP618033988, T2C, T2B);
T4g = fma (KP618033988, T2B, T2C);
T3s = fma (KP951056516, T2L, T2I);
T2M = fma (-KP951056516, T2L, T2I);
T4d = fma (KP951056516, T4c, T4b);
T57 = fma (-KP951056516, T4c, T4b);
}
}
}
}
}
{
real_t T4h, T55, T3v, T3r, T3H, T3k, T36, T38, T3o, T3q,
T3j, T2Z, T37;
{
real_t TM, T2E, T1s, T1u, TP, T1t;
{
real_t TO, TN, T4f, T2A;
TM = Ts + TL;
TO = Ts - TL;
T4f = fma (KP559016994, T2z, T2y);
T2A = fma (-KP559016994, T2z, T2y);
TN = fma (-KP250000000, TM, T9);
T4h = fma (-KP951056516, T4g, T4f);
T55 = fma (KP951056516, T4g, T4f);
T3v = fma (KP951056516, T2D, T2A);
T2E = fma (-KP951056516, T2D, T2A);
T1s = fma (KP618033988, T1r, T18);
T1u = fma (-KP618033988, T18, T1r);
TP = fma (KP559016994, TO, TN);
T1t = fma (-KP559016994, TO, TN);
}
{
real_t T1J, T1N, T1M, T1O, T1G, T1I, T1H;
T1G = T1E + T1F;
T1I = T1E - T1F;
T1H = fma (-KP250000000, T1G, T1D);
v0.y = T1D + T1G;
v0.x = T9 + TM;
T1J = fma (KP559016994, T1I, T1H);
T1N = fma (-KP559016994, T1I, T1H);
T1M = fma (KP618033988, T1L, T1K);
T1O = fma (-KP618033988, T1K, T1L);
{
real_t T1V, T3f, T3m, T3n, T2W, T2Y, T32, T3g,
T3h, T35, T3i, T2X;
T3r = fma (KP951056516, T1U, T1R);
T1V = fma (-KP951056516, T1U, T1R);
T3f = fma (KP951056516, T3e, T3b);
T3H = fma (-KP951056516, T3e, T3b);
v15.x = fma (-KP951056516, T1u, T1t);
v15.y = fma (KP951056516, T1O, T1N);
v10.y = fma (-KP951056516, T1O, T1N);
v10.x = fma (KP951056516, T1u, T1t);
v20.x = fma (KP951056516, T1s, TP);
v20.y = fma (-KP951056516, T1M, T1J);
v5.y = fma (KP951056516, T1M, T1J);
v5.x = fma (-KP951056516, T1s, TP);
{
real_t T30, T2a, T2p, T31, T33, T2F, T2U,
T34, T2q, T2V;
T30 = fma (KP256756360, T22, T29);
T2a = fma (-KP256756360, T29, T22);
T2p = fma (KP634619297, T2o, T2h);
T31 = fma (-KP634619297, T2h, T2o);
T33 = fma (KP549754652, T2x, T2E);
T2F = fma (-KP549754652, T2E, T2x);
T2U = fma (-KP939062505, T2T, T2M);
T34 = fma (KP939062505, T2M, T2T);
T3m = fma (-KP871714437, T2p, T2a);
T2q = fma (KP871714437, T2p, T2a);
T3n = fma (-KP831864738, T2U, T2F);
T2V = fma (KP831864738, T2U, T2F);
T2W = fma (KP904730450, T2V, T2q);
T2Y = fma (-KP904730450, T2V, T2q);
T32 = fma (-KP871714437, T31, T30);
T3g = fma (KP871714437, T31, T30);
T3h = fma (KP831864738, T34, T33);
T35 = fma (-KP831864738, T34, T33);
}
T3i = fma (KP904730450, T3h, T3g);
T3k = fma (-KP904730450, T3h, T3g);
T36 = fma (KP559154169, T35, T32);
T38 = fma (-KP683113946, T32, T35);
v1.y = fma (KP968583161, T3i, T3f);
v1.x = fma (KP968583161, T2W, T1V);
T2X = fma (-KP242145790, T2W, T1V);
T3o = fma (KP559154169, T3n, T3m);
T3q = fma (-KP683113946, T3m, T3n);
T3j = fma (-KP242145790, T3i, T3f);
T2Z = fma (KP541454447, T2Y, T2X);
T37 = fma (-KP541454447, T2Y, T2X);
}
}
}
{
real_t T4F, T47, T5A, T5s, T5u, T5E, T5G, T5z, T5l,
T5t;
{
real_t T53, T5v, T5C, T5D, T5i, T5k, T5o, T5x,
T5w, T5r, T3p, T3l, T5y, T5j;
T4F = fma (-KP951056516, T4E, T4D);
T53 = fma (KP951056516, T4E, T4D);
T3p = fma (-KP541454447, T3k, T3j);
T3l = fma (KP541454447, T3k, T3j);
v11.x = fma (KP833417178, T38, T37);
v11.y = fma (-KP833417178, T3q, T3p);
v16.y = fma (KP833417178, T3q, T3p);
v16.x = fma (-KP833417178, T38, T37);
v21.x = fma (KP921177326, T36, T2Z);
v21.y = fma (-KP921177326, T3o, T3l);
v6.y = fma (KP921177326, T3o, T3l);
v6.x = fma (-KP921177326, T36, T2Z);
T5v = fma (-KP951056516, T46, T45);
T47 = fma (KP951056516, T46, T45);
{
real_t T5p, T56, T59, T5q, T5m, T5d, T5g, T5n,
T5a, T5h;
T5p = fma (-KP062914667, T54, T55);
T56 = fma (KP062914667, T55, T54);
T59 = fma (-KP827271945, T58, T57);
T5q = fma (KP827271945, T57, T58);
T5m = fma (-KP126329378, T5b, T5c);
T5d = fma (KP126329378, T5c, T5b);
T5g = fma (KP939062505, T5f, T5e);
T5n = fma (-KP939062505, T5e, T5f);
T5C = fma (-KP772036680, T59, T56);
T5a = fma (KP772036680, T59, T56);
T5D = fma (-KP734762448, T5g, T5d);
T5h = fma (KP734762448, T5g, T5d);
T5i = fma (KP994076283, T5h, T5a);
T5k = fma (-KP994076283, T5h, T5a);
T5o = fma (KP734762448, T5n, T5m);
T5x = fma (-KP734762448, T5n, T5m);
T5w = fma (KP772036680, T5q, T5p);
T5r = fma (-KP772036680, T5q, T5p);
}
T5y = fma (KP994076283, T5x, T5w);
T5A = fma (-KP994076283, T5x, T5w);
T5s = fma (-KP621716863, T5r, T5o);
T5u = fma (KP614372930, T5o, T5r);
v3.x = fma (-KP998026728, T5y, T5v);
v3.y = fma (KP998026728, T5i, T53);
T5j = fma (-KP249506682, T5i, T53);
T5E = fma (KP614372930, T5D, T5C);
T5G = fma (-KP621716863, T5C, T5D);
T5z = fma (KP249506682, T5y, T5v);
T5l = fma (-KP557913902, T5k, T5j);
T5t = fma (KP557913902, T5k, T5j);
}
{
real_t T4W, T4S, T4U, T50, T52, T4V, T4P, T4T;
{
real_t T4R, T4Q, T4C, T4Z, T4I, T4L, T4Y, T5F,
T5B, T4O, T4M, T4N;
T5F = fma (KP557913902, T5A, T5z);
T5B = fma (-KP557913902, T5A, T5z);
v8.y = fma (KP943557151, T5s, T5l);
v8.x = fma (KP943557151, T5G, T5F);
v23.x = fma (-KP943557151, T5G, T5F);
v23.y = fma (-KP943557151, T5s, T5l);
v18.y = fma (-KP949179823, T5u, T5t);
v18.x = fma (-KP949179823, T5E, T5B);
v13.x = fma (KP949179823, T5E, T5B);
v13.y = fma (KP949179823, T5u, T5t);
{
real_t T4G, T4e, T4l, T4H, T4J, T4t, T4A,
T4K, T4m, T4B;
T4G = fma (KP062914667, T4a, T4d);
T4e = fma (-KP062914667, T4d, T4a);
T4l = fma (-KP634619297, T4k, T4h);
T4H = fma (KP634619297, T4h, T4k);
T4J = fma (KP470564281, T4p, T4s);
T4t = fma (-KP470564281, T4s, T4p);
T4A = fma (-KP549754652, T4z, T4w);
T4K = fma (KP549754652, T4w, T4z);
T4R = fma (-KP845997307, T4l, T4e);
T4m = fma (KP845997307, T4l, T4e);
T4Q = fma (-KP968479752, T4A, T4t);
T4B = fma (KP968479752, T4A, T4t);
T4C = fma (-KP906616052, T4B, T4m);
T4W = fma (KP906616052, T4B, T4m);
T4Z = fma (-KP845997307, T4H, T4G);
T4I = fma (KP845997307, T4H, T4G);
T4L = fma (KP968479752, T4K, T4J);
T4Y = fma (-KP968479752, T4K, T4J);
}
T4O = fma (-KP906616052, T4L, T4I);
T4M = fma (KP906616052, T4L, T4I);
T4S = fma (-KP681693190, T4R, T4Q);
T4U = fma (KP560319534, T4Q, T4R);
T50 = fma (KP681693190, T4Z, T4Y);
T52 = fma (-KP560319534, T4Y, T4Z);
T4N = fma (-KP249506682, T4M, T4F);
v2.x = fma (-KP998026728, T4C, T47);
v2.y = fma (KP998026728, T4M, T4F);
T4V = fma (KP249506682, T4C, T47);
T4P = fma (-KP557913902, T4O, T4N);
T4T = fma (KP557913902, T4O, T4N);
}
{
real_t T3S, T3T, T3G, T41, T3K, T3N, T40, T51,
T4X, T3Q, T3O, T3P;
T51 = fma (-KP557913902, T4W, T4V);
T4X = fma (KP557913902, T4W, T4V);
v12.y = fma (-KP949179823, T4U, T4T);
v12.x = fma (-KP949179823, T52, T51);
v17.x = fma (KP949179823, T52, T51);
v17.y = fma (KP949179823, T4U, T4T);
v7.y = fma (-KP860541664, T4S, T4P);
v7.x = fma (KP860541664, T50, T4X);
v22.x = fma (-KP860541664, T50, T4X);
v22.y = fma (KP860541664, T4S, T4P);
{
real_t T3I, T3u, T3x, T3J, T3L, T3B, T3E,
T3M, T3y, T3F;
T3I = fma (-KP126329378, T3s, T3t);
T3u = fma (KP126329378, T3t, T3s);
T3x = fma (KP470564281, T3w, T3v);
T3J = fma (-KP470564281, T3v, T3w);
T3L = fma (KP634619297, T3z, T3A);
T3B = fma (-KP634619297, T3A, T3z);
T3E = fma (KP827271945, T3D, T3C);
T3M = fma (-KP827271945, T3C, T3D);
T3S = fma (-KP912018591, T3x, T3u);
T3y = fma (KP912018591, T3x, T3u);
T3T = fma (-KP912575812, T3E, T3B);
T3F = fma (KP912575812, T3E, T3B);
T3G = fma (KP851038619, T3F, T3y);
T3Y = fma (-KP851038619, T3F, T3y);
T41 = fma (KP912018591, T3J, T3I);
T3K = fma (-KP912018591, T3J, T3I);
T3N = fma (-KP912575812, T3M, T3L);
T40 = fma (KP912575812, T3M, T3L);
}
T3Q = fma (KP851038619, T3N, T3K);
T3O = fma (-KP851038619, T3N, T3K);
T3U = fma (KP525970792, T3T, T3S);
T3W = fma (-KP726211448, T3S, T3T);
T42 = fma (KP726211448, T41, T40);
T44 = fma (-KP525970792, T40, T41);
T3P = fma (KP248028675, T3O, T3H);
v4.x = fma (-KP992114701, T3G, T3r);
v4.y = fma (-KP992114701, T3O, T3H);
T3X = fma (KP248028675, T3G, T3r);
T3R = fma (-KP554608978, T3Q, T3P);
T3V = fma (KP554608978, T3Q, T3P);
}
}
}
}
}
T3Z = fma (KP554608978, T3Y, T3X);
T43 = fma (-KP554608978, T3Y, T3X);
v19.y = fma (KP943557151, T3U, T3R);
v19.x = fma (-KP943557151, T44, T43);
v14.x = fma (KP943557151, T44, T43);
v14.y = fma (-KP943557151, T3U, T3R);
v24.y = fma (KP803003575, T3W, T3V);
v24.x = fma (KP803003575, T42, T3Z);
v9.x = fma (-KP803003575, T42, T3Z);
v9.y = fma (-KP803003575, T3W, T3V);
}
}
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
*u9 = v9;
*u10 = v10;
*u11 = v11;
*u12 = v12;
*u13 = v13;
*u14 = v14;
*u15 = v15;
*u16 = v16;
*u17 = v17;
*u18 = v18;
*u19 = v19;
*u20 = v20;
*u21 = v21;
*u22 = v22;
*u23 = v23;
*u24 = v24;
}
/*
 * One radix-25 pass.
 *
 * A work-item whose get_global_id(0) is below `threads` reads 25 values from
 * x spaced `threads` elements apart, multiplies values 1..24 by twiddle
 * factors when p != 1, transforms them with dft25(), and writes the 25
 * results to y spaced `p` elements apart.  get_global_id(1) selects a batch
 * of threads * 25 consecutive elements in both buffers.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads)
        return;

    const size_t rem = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 25;

    /* Angle constant for input m is stored at index m - 1 (input 0 is never rotated). */
    const double angle[24] = {
        -0.2513274122871835, -0.5026548245743669, -0.7539822368615503,
        -1.005309649148734,  -1.256637061435917,  -1.507964473723101,
        -1.759291886010284,  -2.010619298297468,  -2.261946710584651,
        -2.513274122871834,  -2.764601535159018,  -3.015928947446201,
        -3.267256359733385,  -3.518583772020568,  -3.769911184307751,
        -4.021238596594936,  -4.272566008882119,  -4.523893421169302,
        -4.775220833456485,  -5.026548245743669,  -5.277875658030853,
        -5.529203070318036,  -5.780530482605219,  -6.031857894892402
    };

    global const double2 *src = x + (tid + batch_offset);
    double2 v[25];

    /* Gather: the stride between consecutive inputs is `threads`. */
    for (uint m = 0; m < 25; ++m)
        v[m] = src[m * threads];

    if (p != 1) {
        for (uint m = 1; m < 25; ++m)
            v[m] = mul(v[m], twiddle(angle[m - 1] * rem / p));
    }

    dft25(&v[0],  &v[1],  &v[2],  &v[3],  &v[4],
          &v[5],  &v[6],  &v[7],  &v[8],  &v[9],
          &v[10], &v[11], &v[12], &v[13], &v[14],
          &v[15], &v[16], &v[17], &v[18], &v[19],
          &v[20], &v[21], &v[22], &v[23], &v[24]);

    /* Scatter: the stride between consecutive outputs is `p`. */
    const size_t first = rem + (tid - rem) * 25;
    global double2 *dst = y + (first + batch_offset);
    for (uint m = 0; m < 25; ++m)
        dst[m * p] = v[m];
}
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){0.36103911645716541} exceeds 1e-08
FFT(C2C) size=9x1x8 batch=2
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b, with .x as the real part and .y as the imaginary part. */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Returns the pair (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    /* sincos() returns the sine and stores the cosine through its pointer argument. */
    const double sine = sincos(alpha, &cosine);
    double2 w;

    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
/*
* dft8: in-place 8-point DFT codelet (generator options: -n 8, -sign -1).
*
* u0..u7 point at the eight complex inputs (.x and .y components).  The
* values are copied into locals v0..v7, transformed, and the results are
* stored back through the same pointers, so every argument is both read
* and written.  The statement order comes from the generator's scheduler;
* keep it as is.
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
/* Working copies of the eight inputs. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
/* Numerically 1/sqrt(2); the only multiplicative constant in this codelet. */
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
{
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq,
Tt;
real_t TM;
{
real_t Tj, T4, T5, Tk;
{
/* Sums and differences of inputs 0 and 4; inputs 2 and 6 are loaded here too. */
real_t Tg, T1, T2, Th;
Tg = v0.y;
T1 = v0.x;
T2 = v4.x;
Th = v4.y;
Tj = v2.y;
T4 = v2.x;
Tn = T1 - T2;
T3 = T1 + T2;
TC = Tg - Th;
Ti = Tg + Th;
T5 = v6.x;
Tk = v6.y;
}
{
/* Sums and differences of inputs 2/6, 7/3 and 1/5. */
real_t Tw, Tb, Tc, Tx;
Tw = v7.y;
Tb = v7.x;
TB = T4 - T5;
T6 = T4 + T5;
To = Tj - Tk;
Tl = Tj + Tk;
Tc = v3.x;
Tx = v3.y;
{
real_t Tr, T8, Tv, Ty, T9, Ts;
Tr = v1.y;
T8 = v1.x;
Td = Tb + Tc;
Tv = Tb - Tc;
TN = Tw + Tx;
Ty = Tw - Tx;
T9 = v5.x;
Ts = v5.y;
Tz = Tv - Ty;
TH = Tv + Ty;
Ta = T8 + T9;
Tq = T8 - T9;
Tt = Tr - Ts;
TM = Tr + Ts;
}
}
}
{
real_t TL, TG, Tu, Tf, Tm, TO;
{
/* Outputs 0 and 4 need only additions and subtractions. */
real_t T7, Te, TP, TQ;
TL = T3 - T6;
T7 = T3 + T6;
TG = Tt - Tq;
Tu = Tq + Tt;
Te = Ta + Td;
Tf = Td - Ta;
Tm = Ti - Tl;
TP = Ti + Tl;
TQ = TM + TN;
TO = TM - TN;
v0.x = T7 + Te;
v0.y = TP + TQ;
v4.y = TP - TQ;
v4.x = T7 - Te;
}
{
/* Outputs 2 and 6 (add/sub only), then outputs 1 and 5 via fused multiply-add. */
real_t Tp, TA, TJ, TK;
TF = Tn - To;
Tp = Tn + To;
v2.y = Tf + Tm;
v2.x = TL + TO;
v6.x = TL - TO;
v6.y = Tm - Tf;
TA = Tu + Tz;
TE = Tz - Tu;
TD = TB + TC;
TJ = TC - TB;
TK = TG + TH;
TI = TG - TH;
v1.x = fma (KP707106781, TA, Tp);
v1.y = fma (KP707106781, TK, TJ);
v5.y = fma (-KP707106781, TK, TJ);
v5.x = fma (-KP707106781, TA, Tp);
}
}
}
/* Outputs 3 and 7. */
v3.y = fma (KP707106781, TE, TD);
v3.x = fma (KP707106781, TI, TF);
v7.x = fma (-KP707106781, TI, TF);
v7.y = fma (-KP707106781, TE, TD);
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
}
/*
 * One radix-8 pass.
 *
 * A work-item whose get_global_id(0) is below `threads` reads 8 values from
 * x spaced `threads` elements apart, multiplies values 1..7 by twiddle
 * factors when p != 1, transforms them with dft8(), and writes the 8
 * results to y spaced `p` elements apart.  get_global_id(1) selects a batch
 * of threads * 8 consecutive elements in both buffers.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads)
        return;

    const size_t rem = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 8;

    /* Angle constant for input m is stored at index m - 1 (input 0 is never rotated). */
    const double angle[7] = {
        -0.7853981633974483, -1.570796326794897, -2.356194490192345,
        -3.141592653589793,  -3.926990816987241, -4.71238898038469,
        -5.497787143782138
    };

    global const double2 *src = x + (tid + batch_offset);
    double2 v[8];

    /* Gather: the stride between consecutive inputs is `threads`. */
    for (uint m = 0; m < 8; ++m)
        v[m] = src[m * threads];

    if (p != 1) {
        for (uint m = 1; m < 8; ++m)
            v[m] = mul(v[m], twiddle(angle[m - 1] * rem / p));
    }

    dft8(&v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7]);

    /* Scatter: the stride between consecutive outputs is `p`. */
    const size_t first = rem + (tid - rem) * 8;
    global double2 *dst = y + (first + batch_offset);
    for (uint m = 0; m < 8; ++m)
        dst[m * p] = v[m];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[tx + ty * height], where
 * (gx, gy) is the work-item's global id, tx = local_id(1) + group_id(1)
 * and ty = local_id(0) + group_id(0).  Work-items outside the
 * width x height range copy nothing but still reach the barrier.
 *
 * The local staging buffer holds a single element, so any local id other
 * than (0, 0) would index past it.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates: the two axes swap roles (tile edge is 1). */
    const size_t tx = ly + get_group_id(1);
    const size_t ty = lx + get_group_id(0);

    const size_t slot = lx + ly;
    const bool inside = (gx < width) && (gy < height);

    local double2 block[1];

    if (inside)
        block[slot] = input[gx + gy * width];

    barrier(CLK_LOCAL_MEM_FENCE);

    if (inside)
        output[tx + ty * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/* Complex product a * b, with .x as the real part and .y as the imaginary part. */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Returns the pair (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    /* sincos() returns the sine and stores the cosine through its pointer argument. */
    const double sine = sincos(alpha, &cosine);
    double2 w;

    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 9 -name dft9 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 80 FP additions, 56 FP multiplications,
* (or, 24 additions, 0 multiplications, 56 fused multiply/add),
* 67 stack variables, 10 constants, and 36 memory accesses
*/
/*
* dft9: in-place 9-point DFT codelet (generator options: -n 9, -sign -1).
*
* u0..u8 point at the nine complex inputs (.x and .y components).  The
* values are copied into locals v0..v8, transformed, and the results are
* stored back through the same pointers, so every argument is both read
* and written.  The statement order comes from the generator's scheduler;
* keep it as is.
*/
DEVICE void
dft9 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8)
{
/* Working copies of the nine inputs. */
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
real2_t v8 = *u8;
{
/* Multiplicative constants; each name carries the leading digits of its value. */
const real_t KP954188894 =
+0.954188894138671133499268364187245676532219158;
/* Numerically tan(20 degrees). */
const real_t KP363970234 =
+0.363970234266202361351047882776834043890471784;
const real_t KP852868531 =
+0.852868531952443209628250963940074071936020296;
/* Numerically cos(10 degrees). */
const real_t KP984807753 =
+0.984807753012208059366743024589523013670643252;
/* Half of KP984807753. */
const real_t KP492403876 =
+0.492403876506104029683371512294761506835321626;
const real_t KP777861913 =
+0.777861913430206160028177977318626690410586096;
/* Numerically tan(40 degrees). */
const real_t KP839099631 =
+0.839099631177280011763127298123181364687434283;
/* Numerically tan(10 degrees). */
const real_t KP176326980 =
+0.176326980708464973471090386868618986121633062;
/* Numerically sqrt(3)/2. */
const real_t KP866025403 =
+0.866025403784438646763723170752936183471402627;
const real_t KP500000000 =
+0.500000000000000000000000000000000000000000000;
{
real_t T17, TV, T14, TY, T11, T15;
{
real_t Tm, TM, TL, T5, Tl, T1f, Tt, Tb, Ta, T1c, TI, TX, TF,
TW, Tp;
real_t Tc, Td, Tq;
{
/* Combine inputs 0, 3 and 6. */
real_t Th, T1, Ti, Tj, T4, T2, T3;
Th = v0.y;
T1 = v0.x;
T2 = v3.x;
Ti = v3.y;
Tj = v6.y;
T3 = v6.x;
T4 = T2 + T3;
Tm = T3 - T2;
{
/* Combine inputs 1, 4 and 7. */
real_t Tz, T6, TA, T7, T8, TB, Tk;
Tz = v1.y;
T6 = v1.x;
TM = Ti - Tj;
Tk = Ti + Tj;
TL = fma (-KP500000000, T4, T1);
T5 = T1 + T4;
Tl = fma (-KP500000000, Tk, Th);
T1f = Th + Tk;
TA = v4.y;
T7 = v4.x;
T8 = v7.x;
TB = v7.y;
{
real_t TE, T9, TH, TC, TG, TD;
Tt = v2.y;
Tb = v2.x;
TE = T7 - T8;
T9 = T7 + T8;
TH = TB - TA;
TC = TA + TB;
Ta = T6 + T9;
TG = fma (-KP500000000, T9, T6);
T1c = Tz + TC;
TD = fma (-KP500000000, TC, Tz);
TI = fma (-KP866025403, TH, TG);
TX = fma (KP866025403, TH, TG);
TF = fma (-KP866025403, TE, TD);
TW = fma (KP866025403, TE, TD);
Tp = v5.y;
Tc = v5.x;
Td = v8.x;
Tq = v8.y;
}
}
}
{
real_t Tn, TN, TZ, T10, TO, Ty, TJ, TP;
{
/* Combine inputs 2, 5 and 8. */
real_t Tw, Te, Tu, Tr;
T17 = fma (-KP866025403, Tm, Tl);
Tn = fma (KP866025403, Tm, Tl);
Tw = Td - Tc;
Te = Tc + Td;
Tu = Tp + Tq;
Tr = Tp - Tq;
TN = fma (KP866025403, TM, TL);
TV = fma (-KP866025403, TM, TL);
{
real_t Tf, To, T1d, Tv;
Tf = Tb + Te;
To = fma (-KP500000000, Te, Tb);
T1d = Tt + Tu;
Tv = fma (-KP500000000, Tu, Tt);
{
real_t Ts, Tg, T1i, Tx;
Ts = fma (KP866025403, Tr, To);
TZ = fma (-KP866025403, Tr, To);
Tg = Ta + Tf;
T1i = Tf - Ta;
Tx = fma (KP866025403, Tw, Tv);
T10 = fma (-KP866025403, Tw, Tv);
{
/* Outputs 0, 3 and 6. */
real_t T1e, T1g, T1b, T1h;
T1e = T1c - T1d;
T1g = T1c + T1d;
T1b = fma (-KP500000000, Tg, T5);
v0.x = T5 + Tg;
v0.y = T1f + T1g;
T1h = fma (-KP500000000, T1g, T1f);
TO = fma (KP176326980, Ts, Tx);
Ty = fma (-KP176326980, Tx, Ts);
v6.x = fma (-KP866025403, T1e, T1b);
v6.y = fma (-KP866025403, T1i, T1h);
v3.y = fma (KP866025403, T1i, T1h);
v3.x = fma (KP866025403, T1e, T1b);
TJ = fma (-KP839099631, TI, TF);
TP = fma (KP839099631, TF, TI);
}
}
}
}
{
/* Outputs 1, 4 and 7. */
real_t TS, TK, TU, TQ, TT, TR;
TS = fma (KP777861913, TJ, Ty);
TK = fma (-KP777861913, TJ, Ty);
TU = fma (-KP777861913, TP, TO);
TQ = fma (KP777861913, TP, TO);
TT = fma (KP492403876, TK, Tn);
TR = fma (-KP492403876, TQ, TN);
v1.x = fma (KP984807753, TQ, TN);
v1.y = fma (-KP984807753, TK, Tn);
v7.y = fma (-KP852868531, TU, TT);
v7.x = fma (-KP852868531, TS, TR);
v4.x = fma (KP852868531, TS, TR);
v4.y = fma (KP852868531, TU, TT);
T14 = fma (-KP176326980, TW, TX);
TY = fma (KP176326980, TX, TW);
T11 = fma (-KP363970234, T10, TZ);
T15 = fma (KP363970234, TZ, T10);
}
}
}
{
/* Outputs 2, 5 and 8. */
real_t T12, T1a, T16, T18, T13, T19;
T12 = fma (-KP954188894, T11, TY);
T1a = fma (KP954188894, T11, TY);
T16 = fma (-KP954188894, T15, T14);
T18 = fma (KP954188894, T15, T14);
T13 = fma (-KP492403876, T12, TV);
T19 = fma (KP492403876, T18, T17);
v2.y = fma (-KP984807753, T18, T17);
v2.x = fma (KP984807753, T12, TV);
v8.x = fma (KP852868531, T16, T13);
v8.y = fma (KP852868531, T1a, T19);
v5.y = fma (-KP852868531, T1a, T19);
v5.x = fma (-KP852868531, T16, T13);
}
}
}
/* Store the results back through the argument pointers. */
*u0 = v0;
*u1 = v1;
*u2 = v2;
*u3 = v3;
*u4 = v4;
*u5 = v5;
*u6 = v6;
*u7 = v7;
*u8 = v8;
}
/*
 * One radix-9 pass.
 *
 * A work-item whose get_global_id(0) is below `threads` reads 9 values from
 * x spaced `threads` elements apart, multiplies values 1..8 by twiddle
 * factors when p != 1, transforms them with dft9(), and writes the 9
 * results to y spaced `p` elements apart.  get_global_id(1) selects a batch
 * of threads * 9 consecutive elements in both buffers.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,
    uint threads
)
{
    const size_t tid = get_global_id(0);
    if (tid >= threads)
        return;

    const size_t rem = tid % p;
    const size_t batch_offset = get_global_id(1) * threads * 9;

    /* Angle constant for input m is stored at index m - 1 (input 0 is never rotated). */
    const double angle[8] = {
        -0.6981317007977318, -1.396263401595464, -2.094395102393195,
        -2.792526803190927,  -3.490658503988659, -4.188790204786391,
        -4.886921905584122,  -5.585053606381854
    };

    global const double2 *src = x + (tid + batch_offset);
    double2 v[9];

    /* Gather: the stride between consecutive inputs is `threads`. */
    for (uint m = 0; m < 9; ++m)
        v[m] = src[m * threads];

    if (p != 1) {
        for (uint m = 1; m < 9; ++m)
            v[m] = mul(v[m], twiddle(angle[m - 1] * rem / p));
    }

    dft9(&v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8]);

    /* Scatter: the stride between consecutive outputs is `p`. */
    const size_t first = rem + (tid - rem) * 9;
    global double2 *dst = y + (first + batch_offset);
    for (uint m = 0; m < 9; ++m)
        dst[m * p] = v[m];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[tx + ty * height], where
 * (gx, gy) is the work-item's global id, tx = local_id(1) + group_id(1)
 * and ty = local_id(0) + group_id(0).  Work-items outside the
 * width x height range copy nothing but still reach the barrier.
 *
 * The local staging buffer holds a single element, so any local id other
 * than (0, 0) would index past it.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates: the two axes swap roles (tile edge is 1). */
    const size_t tx = ly + get_group_id(1);
    const size_t ty = lx + get_group_id(0);

    const size_t slot = lx + ly;
    const bool inside = (gx < width) && (gy < height);

    local double2 block[1];

    if (inside)
        block[slot] = input[gx + gy * width];

    barrier(CLK_LOCAL_MEM_FENCE);

    if (inside)
        output[tx + ty * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Copies input[gx + gy * width] to output[tx + ty * height], where
 * (gx, gy) is the work-item's global id, tx = local_id(1) + group_id(1)
 * and ty = local_id(0) + group_id(0).  Work-items outside the
 * width x height range copy nothing but still reach the barrier.
 *
 * The local staging buffer holds a single element, so any local id other
 * than (0, 0) would index past it.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t gx = get_global_id(0);
    const size_t gy = get_global_id(1);
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    /* Destination coordinates: the two axes swap roles (tile edge is 1). */
    const size_t tx = ly + get_group_id(1);
    const size_t ty = lx + get_group_id(0);

    const size_t slot = lx + ly;
    const bool inside = (gx < width) && (gy < height);

    local double2 block[1];

    if (inside)
        block[slot] = input[gx + gy * width];

    barrier(CLK_LOCAL_MEM_FENCE);

    if (inside)
        output[tx + ty * height] = block[slot];
}
#define DEVICE
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
typedef double real_t;
typedef double2 real2_t;
/*
 * Complex product of a with the conjugate of b, with .x as the real part
 * and .y as the imaginary part.
 */
double2 mul(double2 a, double2 b)
{
    double2 prod;

    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/* Returns the pair (cos(alpha), sin(alpha)) packed as (.x, .y). */
double2 twiddle(double alpha)
{
    double cosine;
    /* sincos() returns the sine and stores the cosine through its pointer argument. */
    const double sine = sincos(alpha, &cosine);
    double2 w;

    w.x = cosine;
    w.y = sine;
    return w;
}
/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 43 stack variables, 1 constants, and 32 memory accesses
*/
DEVICE void
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
real2_t * u5, real2_t * u6, real2_t * u7)
{
real2_t v0 = *u0;
real2_t v1 = *u1;
real2_t v2 = *u2;
real2_t v3 = *u3;
real2_t v4 = *u4;
real2_t v5 = *u5;
real2_t v6 = *u6;
real2_t v7 = *u7;
{
const real_t KP707106781 =
+0.707106781186547524400844362104849039284835938;
{
real_t TF, TE, TD, TI;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment