-
-
Save lajash/59d9a2f489d2aa05f1e9 to your computer and use it in GitHub Desktop.
Test Log for CPU tests on a MacPro
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Start testing: Jan 30 11:27 IST | |
---------------------------------------------------------- | |
1/30 Testing: boost_version | |
1/30 Test: boost_version | |
Command: "/tmp/vexcl/build/tests/boost_version" | |
Directory: /tmp/vexcl/build/tests | |
"boost_version" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
Boost version: 105600 | |
<end of output> | |
Test time = 0.07 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"boost_version" end time: Jan 30 11:27 IST | |
"boost_version" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
2/30 Testing: types | |
2/30 Test: types | |
Command: "/tmp/vexcl/build/tests/types" | |
Directory: /tmp/vexcl/build/tests | |
"types" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
Running 2 test cases... | |
*** No errors detected | |
<end of output> | |
Test time = 0.01 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"types" end time: Jan 30 11:27 IST | |
"types" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
3/30 Testing: deduce | |
3/30 Test: deduce | |
Command: "/tmp/vexcl/build/tests/deduce" | |
Directory: /tmp/vexcl/build/tests | |
"deduce" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597454 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 9 test cases... | |
terminal(5) | |
int | |
terminal(4.2) | |
double | |
terminal(N3vex15vector_terminalE) | |
double | |
terminal(N3vex15vector_terminalE) | |
int | |
terminal(N3vex15vector_terminalE) | |
double2 | |
terminal(N3vex10elem_indexE) | |
ulong | |
terminal(N3vex12mba_terminalE) | |
float | |
terminal(N3vex24tagged_terminal_terminalE) | |
double | |
terminal(N3vex18temporary_terminalE) | |
double | |
terminal(N3vex20vector_view_terminalE) | |
int | |
less( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
long | |
greater( | |
terminal(5) | |
, function( | |
terminal(N3vex8pow_funcE) | |
, terminal(N3vex15vector_terminalE) | |
, multiplies( | |
terminal(2) | |
, terminal(N3vex15vector_terminalE) | |
) | |
) | |
) | |
long | |
logical_not( | |
terminal(N3vex15vector_terminalE) | |
) | |
long | |
plus( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
double | |
plus( | |
terminal(N3vex15vector_terminalE) | |
, multiplies( | |
terminal(2) | |
, terminal(N3vex15vector_terminalE) | |
) | |
) | |
double | |
negate( | |
terminal(N3vex15vector_terminalE) | |
) | |
int | |
multiplies( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
double2 | |
multiplies( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
double2 | |
multiplies( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
double2 | |
function( | |
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f1) | |
, terminal(N3vex15vector_terminalE) | |
) | |
double | |
function( | |
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f2) | |
, terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
int | |
function( | |
terminal(ZN2cr14user_functions11test_methodEvE15vex_function_f2) | |
, plus( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
, minus( | |
terminal(N3vex15vector_terminalE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
) | |
int | |
if_else_( | |
less( | |
terminal(N3vex15vector_terminalE) | |
, terminal(0) | |
) | |
, terminal(1) | |
, terminal(N3vex15vector_terminalE) | |
) | |
int | |
dereference( | |
if_else_( | |
less( | |
terminal(N3vex15vector_terminalE) | |
, terminal(0) | |
) | |
, address_of( | |
terminal(N3vex15vector_terminalE) | |
) | |
, address_of( | |
terminal(N3vex15vector_terminalE) | |
) | |
) | |
) | |
double | |
minus( | |
function( | |
terminal(N3vex8cos_funcE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
, function( | |
terminal(N3vex8sin_funcE) | |
, terminal(N3vex15vector_terminalE) | |
) | |
) | |
double | |
function( | |
terminal(N3vex8pow_funcE) | |
, terminal(N3vex15vector_terminalE) | |
, multiplies( | |
terminal(2) | |
, terminal(N3vex15vector_terminalE) | |
) | |
) | |
double | |
terminal(N3vex28reduced_vector_view_terminalE) | |
double | |
terminal(N3vex13cast_terminalE) | |
double | |
terminal(N3vex13cast_terminalE) | |
int | |
*** No errors detected | |
<end of output> | |
Test time = 0.06 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"deduce" end time: Jan 30 11:27 IST | |
"deduce" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
4/30 Testing: context | |
4/30 Test: context | |
Command: "/tmp/vexcl/build/tests/context" | |
Directory: /tmp/vexcl/build/tests | |
"context" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
Running 1 test case... | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.18 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"context" end time: Jan 30 11:27 IST | |
"context" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
5/30 Testing: vector_create | |
5/30 Test: vector_create | |
Command: "/tmp/vexcl/build/tests/vector_create" | |
Directory: /tmp/vexcl/build/tests | |
"vector_create" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597454 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 14 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global uint * prm_1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global uint * prm_1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] -= prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = sin( prm_2[idx] ); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.29 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"vector_create" end time: Jan 30 11:27 IST | |
"vector_create" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
6/30 Testing: vector_copy | |
6/30 Test: vector_copy | |
Command: "/tmp/vexcl/build/tests/vector_copy" | |
Directory: /tmp/vexcl/build/tests | |
"vector_copy" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597455 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 8 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global ulong * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global ulong * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]]; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.26 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"vector_copy" end time: Jan 30 11:27 IST | |
"vector_copy" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
7/30 Testing: vector_arithmetics | |
7/30 Test: vector_arithmetics | |
Command: "/tmp/vexcl/build/tests/vector_arithmetics" | |
Directory: /tmp/vexcl/build/tests | |
"vector_arithmetics" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597455 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 18 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global double * prm_3, | |
global double * prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( ( prm_2 * sin( prm_3[idx] ) ) + prm_4[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] += prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] -= prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0, c = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double y = (prm_1[idx]) - c; | |
double t = mySum + y; | |
c = (t - mySum) - y; | |
mySum = t; | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double MIN_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 < prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)1.79769e+308; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MIN_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double MAX_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)-1.79769e+308; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double MAX_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)-1.79769e+308; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_double(mySum, fabs( ( prm_1[idx] - prm_2[idx] ) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
double prm_3, | |
global double * prm_4, | |
double prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
ulong greater | |
( | |
double x, | |
double y | |
) | |
{ | |
return x > y; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_ulong(mySum, greater( prm_1[idx], prm_2[idx] )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
double times2 | |
( | |
double x | |
) | |
{ | |
return x * 2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_ulong(mySum, times2( prm_1[idx] )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
double times4 | |
( | |
double x | |
) | |
{ | |
return x * 4; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_ulong(mySum, times4( prm_1[idx] )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = sin( ( prm_2 * (prm_3 + idx) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int4 make_int4 | |
( | |
int x | |
) | |
{ | |
return (int4)(x, x, x, x); | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int4 * prm_1, | |
int4 prm_2, | |
int prm_3, | |
ulong prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int4 make_int4 | |
( | |
int x | |
) | |
{ | |
return (int4)(x, x, x, x); | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int4 * prm_1, | |
int4 prm_2, | |
int prm_3, | |
ulong prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) ); | |
} | |
} | |
<program source>:10:1: error: too few arguments provided to function-like macro invocation | |
) | |
^ | |
<program source>:7:6: error: global variables must have a constant address space qualifier | |
int4 make_int4 | |
^ | |
<program source>:7:15: error: expected ';' after top level declarator | |
int4 make_int4 | |
^ | |
; | |
<program source>:29:65: error: too few arguments provided to function-like macro invocation | |
prm_1[idx] = ( prm_2 * make_int4( ( prm_3 + (prm_4 + idx) ) ) ); | |
^ | |
unknown location:0: fatal error in "vector_values": std::exception: clBuildProgram | |
/tmp/vexcl/tests/vector_arithmetics.cpp:142: last checkpoint | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int f | |
( | |
int x | |
) | |
{ | |
return 2 * x; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = f( f( prm_2[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int g | |
( | |
int x | |
) | |
{ | |
return 3 * x; | |
} | |
int f | |
( | |
int x | |
) | |
{ | |
return 2 * x; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = g( f( prm_2[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
#define THE_ANSWER 42 | |
int answer | |
( | |
int x | |
) | |
{ | |
return x * THE_ANSWER; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = answer( prm_2 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double sin2 | |
( | |
double x | |
) | |
{ | |
return pow(sin(x), 2.0); | |
} | |
double cos2 | |
( | |
double x | |
) | |
{ | |
return pow(cos(x), 2.0); | |
} | |
double one | |
( | |
double x | |
) | |
{ | |
return sin2(x) + cos2(x); | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = one( prm_2[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
double prm_3, | |
global double * prm_4, | |
global double * prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( ( prm_2[idx] > prm_3 ) ? sin( prm_4[idx] ) : cos( prm_5[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
global double * prm_3, | |
global double * prm_4, | |
int prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
( *( ( ( prm_1[idx] < prm_2 ) ? ( &( prm_3[idx] ) ) : ( &( prm_4[idx] ) ) ) ) ) = prm_5; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
ulong prm_tag_1_1, | |
double prm_4, | |
double prm_7 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( pow( sin( ( ( 6.2831853071795862e+00 ) * (prm_tag_1_1 + idx) ) ), prm_4 ) + pow( cos( ( ( 6.2831853071795862e+00 ) * (prm_tag_1_1 + idx) ) ), prm_7 ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = 42; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = 42; | |
} | |
} | |
CVMS_ERROR_COMPILER_FAILURE: CVMS compiler has crashed or hung building an element. | |
unknown location:0: fatal error in "constants": std::exception: clBuildProgram | |
/tmp/vexcl/tests/vector_arithmetics.cpp:270: last checkpoint | |
*** 2 failures detected in test suite "VectorArithmetics" | |
<end of output> | |
Test time = 3.07 sec | |
---------------------------------------------------------- | |
Test Failed. | |
"vector_arithmetics" end time: Jan 30 11:27 IST | |
"vector_arithmetics" time elapsed: 00:00:03 | |
---------------------------------------------------------- | |
8/30 Testing: vector_view | |
8/30 Test: vector_view | |
Command: "/tmp/vexcl/build/tests/vector_view" | |
Directory: /tmp/vexcl/build/tests | |
"vector_view" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597458 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 16 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong idx | |
) | |
{ | |
return start + idx * stride0; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong idx | |
) | |
{ | |
return start + idx * stride0; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global double * prm_2_expr_2, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double prm_2_val; | |
{ | |
size_t pos = prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, idx); | |
size_t idx = pos; | |
prm_2_val = ( prm_2_expr_1[idx] * prm_2_expr_2[idx] ); | |
} | |
prm_1[idx] = prm_2_val; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0, | |
ulong prm_2_slice_length1, | |
long prm_2_slice_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2_expr_1, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0, | |
ulong prm_2_slice_length1, | |
long prm_2_slice_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global ulong * prm_1, | |
ulong prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 - (prm_3 + idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global ulong * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global ulong * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global double * prm_2_expr_2, | |
global ulong * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double prm_2_val; | |
{ | |
size_t pos = prm_2_slice_1[idx]; | |
size_t idx = pos; | |
prm_2_val = ( prm_2_expr_1[idx] * prm_2_expr_2[idx] ); | |
} | |
prm_1[idx] = prm_2_val; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
ulong prm_1, | |
ulong prm_2, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, ( prm_1 - (prm_2 + idx) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
ulong prm_2_slice_1, | |
ulong prm_2_slice_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[( prm_2_slice_1 - (prm_2_slice_2 + idx) )]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1_expr_1, | |
ulong prm_1_slice_1, | |
ulong prm_1_slice_2, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[( prm_1_slice_1 - (prm_1_slice_2 + idx) )] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
ulong prm_2_slice_1, | |
ulong prm_2_slice_2_1, | |
int prm_2_slice_2_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
ulong temp_1 = ( (prm_2_slice_2_1 + idx) + prm_2_slice_2_2 ); | |
prm_1[idx] = prm_2_expr_1[( prm_2_slice_1 - temp_1 )]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong idx | |
) | |
{ | |
return start + idx * stride0; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
global int * g_odata | |
) | |
{ | |
int mySum = (int)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_int(mySum, prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, idx)]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong idx | |
) | |
{ | |
return start + idx * stride0; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, idx)] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
ulong prm_1_slice_length1, | |
long prm_1_slice_stride1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
ulong prm_1_slice_length1, | |
long prm_1_slice_stride1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global ulong * prm_1, | |
ulong prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( (prm_2 + idx) * prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
global ulong * prm_1_slice_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_1[idx]] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr0 = prm_2_start; | |
for(size_t i0 = 0, ptr1 = ptr0; i0 < prm_2_length0; ++i0, ptr1 += prm_2_stride0) | |
{ | |
size_t idx = ptr1; | |
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]); | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr0 = prm_2_start; | |
for(size_t i0 = 0, ptr1 = ptr0; i0 < prm_2_length0; ++i0, ptr1 += prm_2_stride0) | |
{ | |
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1) | |
{ | |
size_t idx = ptr2; | |
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]); | |
} | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
ulong prm_1_slice_length1, | |
long prm_1_slice_stride1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, idx)] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0; | |
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1) | |
{ | |
size_t idx = ptr2; | |
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]); | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0; | |
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1) | |
{ | |
size_t idx = ptr2; | |
int temp_1 = prm_2_1_1[idx]; | |
prm_2_sum = SUM_int(prm_2_sum, ( temp_1 * temp_1 )); | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1, | |
ulong prm_2_length2, | |
long prm_2_stride2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0; | |
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1) | |
{ | |
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_length2; ++i2, ptr3 += prm_2_stride2) | |
{ | |
size_t idx = ptr3; | |
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]); | |
} | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
double MAX_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1_1, | |
ulong prm_2_1_start, | |
ulong prm_2_1_length0, | |
long prm_2_1_stride0, | |
ulong prm_2_1_length1, | |
long prm_2_1_stride1, | |
ulong prm_2_1_length2, | |
long prm_2_1_stride2, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double prm_2_sum = (double)-1.79769e+308; | |
{ | |
size_t pos = idx; | |
size_t ptr1 = prm_2_start + (pos % prm_2_length0) * prm_2_stride0; | |
for(size_t i1 = 0, ptr2 = ptr1; i1 < prm_2_length1; ++i1, ptr2 += prm_2_stride1) | |
{ | |
size_t idx = ptr2; | |
double prm_2_1_sum = (double)0; | |
{ | |
size_t pos = idx; | |
size_t ptr2 = prm_2_1_start + (pos % prm_2_1_length1) * prm_2_1_stride1; | |
pos /= prm_2_1_length1; | |
ptr2 += (pos % prm_2_1_length0) * prm_2_1_stride0; | |
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_1_length2; ++i2, ptr3 += prm_2_1_stride2) | |
{ | |
size_t idx = ptr3; | |
prm_2_1_sum = SUM_double(prm_2_1_sum, sin( prm_2_1_1[idx] )); | |
} | |
} | |
prm_2_sum = MAX_double(prm_2_sum, prm_2_1_sum); | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = (prm_2 + idx); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
ulong prm_1, | |
ulong prm_2, | |
ulong prm_3, | |
ulong prm_4, | |
ulong prm_5, | |
ulong prm_6, | |
ulong prm_7, | |
ulong prm_8, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, ( ( prm_1 * ( ( (prm_2 + idx) / prm_3 ) % prm_4 ) ) + ( prm_5 * ( ( (prm_6 + idx) / prm_7 ) % prm_8 ) ) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_expr_1, | |
ulong prm_2_slice_1, | |
ulong prm_2_slice_2, | |
ulong prm_2_slice_3, | |
ulong prm_2_slice_4, | |
ulong prm_2_slice_5, | |
ulong prm_2_slice_6, | |
ulong prm_2_slice_7, | |
ulong prm_2_slice_8 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[( ( prm_2_slice_1 * ( ( (prm_2_slice_2 + idx) / prm_2_slice_3 ) % prm_2_slice_4 ) ) + ( prm_2_slice_5 * ( ( (prm_2_slice_6 + idx) / prm_2_slice_7 ) % prm_2_slice_8 ) ) )]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_expr_1, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0, | |
ulong prm_2_slice_length1, | |
long prm_2_slice_stride1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
ulong prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, (prm_1 + idx)); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_expr_1, | |
ulong prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[(prm_2_slice_1 + idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_expr_1, | |
ulong prm_2_slice_1, | |
ulong prm_2_slice_2, | |
ulong prm_2_slice_3, | |
ulong prm_2_slice_4, | |
ulong prm_2_slice_5, | |
ulong prm_2_slice_6, | |
ulong prm_2_slice_7, | |
ulong prm_2_slice_8, | |
ulong prm_2_slice_9, | |
ulong prm_2_slice_10, | |
ulong prm_2_slice_11, | |
ulong prm_2_slice_12, | |
ulong prm_2_slice_13, | |
ulong prm_2_slice_14, | |
ulong prm_2_slice_15, | |
ulong prm_2_slice_16 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[( ( prm_2_slice_1 * ( ( (prm_2_slice_2 + idx) / prm_2_slice_3 ) % prm_2_slice_4 ) ) + ( ( prm_2_slice_5 * ( ( (prm_2_slice_6 + idx) / prm_2_slice_7 ) % prm_2_slice_8 ) ) + ( ( prm_2_slice_9 * ( ( (prm_2_slice_10 + idx) / prm_2_slice_11 ) % prm_2_slice_12 ) ) + ( prm_2_slice_13 * ( ( (prm_2_slice_14 + idx) / prm_2_slice_15 ) % prm_2_slice_16 ) ) ) ) )]; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 11.31 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"vector_view" end time: Jan 30 11:27 IST | |
"vector_view" time elapsed: 00:00:11 | |
---------------------------------------------------------- | |
9/30 Testing: vector_pointer | |
9/30 Test: vector_pointer | |
Command: "/tmp/vexcl/build/tests/vector_pointer" | |
Directory: /tmp/vexcl/build/tests | |
"vector_pointer" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597469 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double nbody | |
( | |
ulong n, | |
ulong j, | |
global double * x | |
) | |
{ | |
double sum = 0; for(size_t i = 0; i < n; ++i) if (i != j) sum += x[i]; return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
ulong prm_2, | |
ulong prm_3, | |
global double * prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = nbody( prm_2, (prm_3 + idx), prm_4 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
ulong prm_3_1, | |
global double * prm_5, | |
global double * prm_7, | |
ulong prm_tag_1_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
ulong temp_1 = (prm_3_1 + idx); | |
ulong temp_2 = ( ( temp_1 > ( 0 ) ) ? ( temp_1 - ( 1 ) ) : temp_1 ); | |
ulong temp_3 = ( ( ( temp_1 + ( 1 ) ) < prm_tag_1_1 ) ? ( temp_1 + ( 1 ) ) : temp_1 ); | |
prm_1[idx] = ( ( ( ( *( ( prm_2 + temp_1 ) ) ) * ( 2 ) ) - ( *( ( prm_5 + temp_2 ) ) ) ) - ( *( ( prm_7 + temp_3 ) ) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
ulong prm_3_1, | |
global double * prm_5, | |
global double * prm_7, | |
ulong prm_tag_1_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
ulong temp_1 = (prm_3_1 + idx); | |
ulong temp_2 = ( ( temp_1 > ( 0 ) ) ? ( temp_1 - ( 1 ) ) : temp_1 ); | |
ulong temp_3 = ( ( ( temp_1 + ( 1 ) ) < prm_tag_1_1 ) ? ( temp_1 + ( 1 ) ) : temp_1 ); | |
prm_1[idx] = ( ( ( ( ( prm_2 )[ temp_1 ] ) * ( 2 ) ) - ( ( prm_5 )[ temp_2 ] ) ) - ( ( prm_7 )[ temp_3 ] ) ); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.17 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"vector_pointer" end time: Jan 30 11:27 IST | |
"vector_pointer" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
10/30 Testing: tagged_terminal | |
10/30 Test: tagged_terminal | |
Command: "/tmp/vexcl/build/tests/tagged_terminal" | |
Directory: /tmp/vexcl/build/tests | |
"tagged_terminal" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597469 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 5 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_tag_1_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_tag_1_1[idx] * prm_tag_1_1[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
int prm_tag_3_1, | |
global double * prm_tag_1_1, | |
global double * prm_tag_2_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, ( ( ( prm_tag_3_1 * prm_tag_1_1[idx] ) * prm_tag_1_1[idx] ) + ( ( prm_tag_3_1 * prm_tag_2_1[idx] ) * prm_tag_2_1[idx] ) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_tag_1_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_tag_1_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_tag_1_1, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_tag_1_1[idx] = ( prm_tag_1_1[idx] + (prm_3 + idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_tag_3_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong idx | |
) | |
{ | |
return start + idx * stride0; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_tag_1_1, | |
global double * prm_tag_2_1, | |
global double * prm_tag_3_1_expr_1, | |
ulong prm_tag_3_1_slice_start, | |
ulong prm_tag_3_1_slice_length0, | |
long prm_tag_3_1_slice_stride0 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( ( prm_tag_1_1 * prm_tag_2_1[idx] ) + prm_tag_3_1_expr_1[prm_tag_3_1_slice_func(prm_tag_3_1_slice_start, prm_tag_3_1_slice_length0, prm_tag_3_1_slice_stride0, idx)] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_tag_1_1_expr_1, | |
ulong prm_tag_1_1_slice_1, | |
ulong prm_tag_1_1_slice_2_1, | |
int prm_tag_1_1_slice_2_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
ulong temp_1 = ( (prm_tag_1_1_slice_2_1 + idx) + prm_tag_1_1_slice_2_2 ); | |
prm_1[idx] = prm_tag_1_1_expr_1[( prm_tag_1_1_slice_1 - temp_1 )]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_tag_1_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_tag_1_1[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_tag_0_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_tag_0_1[idx]; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.35 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"tagged_terminal" end time: Jan 30 11:27 IST | |
"tagged_terminal" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
11/30 Testing: temporary | |
11/30 Test: temporary | |
Command: "/tmp/vexcl/build/tests/temporary" | |
Directory: /tmp/vexcl/build/tests | |
"temporary" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597470 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 6 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double sqr | |
( | |
double x | |
) | |
{ | |
return x * x; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1, | |
int prm_2_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = ( sqr( prm_2_1[idx] ) + prm_2_2 ); | |
prm_1[idx] = ( temp_1 * ( prm_3[idx] + temp_1 ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1, | |
global double * prm_3_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = log( prm_2_1[idx] ); | |
double temp_2 = ( temp_1 + sin( prm_3_2[idx] ) ); | |
prm_1[idx] = ( temp_1 * temp_2 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
int prm_1, | |
global double * prm_2_1, | |
int prm_2_2, | |
global double * prm_3_1, | |
int prm_3_2, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = pow( sin( prm_2_1[idx] ), prm_2_2 ); | |
double temp_2 = pow( cos( prm_3_1[idx] ), prm_3_2 ); | |
mySum = SUM_double(mySum, ( prm_1 * ( temp_1 + temp_2 ) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = sin( prm_2_1[idx] ); | |
prm_1[idx] = temp_1; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global double * prm_3_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = sin( prm_3_1[idx] ); | |
prm_1[idx] = sqrt( ( prm_2 - ( temp_1 * temp_1 ) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1000 = tan( prm_2_1[idx] ); | |
prm_1[idx] = ( temp_1000 * temp_1000 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1001 = tan( prm_2_1[idx] ); | |
prm_1[idx] = ( temp_1001 * temp_1001 ); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.36 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"temporary" end time: Jan 30 11:27 IST | |
"temporary" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
12/30 Testing: cast | |
12/30 Test: cast | |
Command: "/tmp/vexcl/build/tests/cast" | |
Directory: /tmp/vexcl/build/tests | |
"cast" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597470 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_1; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int2 * prm_1, | |
float2 prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = convert_int2( prm_2_1 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int2 * prm_1, | |
float2 prm_2_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = as_int2( prm_2_1 ); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.14 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"cast" end time: Jan 30 11:27 IST | |
"cast" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
13/30 Testing: multivector_create | |
13/30 Test: multivector_create | |
Command: "/tmp/vexcl/build/tests/multivector_create" | |
Directory: /tmp/vexcl/build/tests | |
"multivector_create" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597470 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 5 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global uint * prm_1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.05 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"multivector_create" end time: Jan 30 11:27 IST | |
"multivector_create" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
14/30 Testing: multivector_arithmetics | |
14/30 Test: multivector_arithmetics | |
Command: "/tmp/vexcl/build/tests/multivector_arithmetics" | |
Directory: /tmp/vexcl/build/tests | |
"multivector_arithmetics" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597470 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 11 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double MIN_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 < prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)1.79769e+308; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MIN_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double MAX_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)-1.79769e+308; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global double * prm_3, | |
global double * prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( ( prm_2 * prm_3[idx] ) + prm_4[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( sin( prm_2[idx] ) + cos( prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( cos( prm_2[idx] ) + sin( prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] - prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
double prm_3, | |
global double * prm_4, | |
double prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong greater | |
( | |
double x, | |
double y | |
) | |
{ | |
return x > y; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = greater( prm_2[idx], prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 * (prm_3 + idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = sin( ( prm_2 * (prm_3 + idx) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = cos( ( prm_2 * (prm_3 + idx) ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] += sin( ( prm_2 * prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] -= sin( ( prm_2 * prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] *= prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] *= sin( prm_2[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = 42; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = 42; | |
} | |
} | |
CVMS_ERROR_COMPILER_FAILURE: CVMS compiler has crashed or hung building an element. | |
unknown location:0: fatal error in "integral_constants": std::exception: clBuildProgram | |
/tmp/vexcl/tests/multivector_arithmetics.cpp:209: last checkpoint | |
*** 1 failure detected in test suite "MultivectorArithmetics" | |
<end of output> | |
Test time = 0.61 sec | |
---------------------------------------------------------- | |
Test Failed. | |
"multivector_arithmetics" end time: Jan 30 11:27 IST | |
"multivector_arithmetics" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
15/30 Testing: multi_array | |
15/30 Test: multi_array | |
Command: "/tmp/vexcl/build/tests/multi_array" | |
Directory: /tmp/vexcl/build/tests | |
"multi_array" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597471 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 5 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 * (prm_3 + idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
double prm_3, | |
global double * prm_4, | |
double prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( pow( sin( prm_2[idx] ), prm_3 ) + pow( cos( prm_4[idx] ), prm_5 ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong length2, | |
long stride2, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length2) * stride2; | |
idx /= length2; | |
ptr += (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
ulong prm_1_slice_length1, | |
long prm_1_slice_stride1, | |
ulong prm_1_slice_length2, | |
long prm_1_slice_stride2, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, prm_1_slice_length2, prm_1_slice_stride2, idx)] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong prm_1_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong length2, | |
long stride2, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length2) * stride2; | |
idx /= length2; | |
ptr += (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
ulong prm_2_slice_func | |
( | |
ulong start, | |
ulong length0, | |
long stride0, | |
ulong length1, | |
long stride1, | |
ulong length2, | |
long stride2, | |
ulong idx | |
) | |
{ | |
size_t ptr = start + (idx % length2) * stride2; | |
idx /= length2; | |
ptr += (idx % length1) * stride1; | |
idx /= length1; | |
ptr += (idx % length0) * stride0; | |
return ptr; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1_expr_1, | |
ulong prm_1_slice_start, | |
ulong prm_1_slice_length0, | |
long prm_1_slice_stride0, | |
ulong prm_1_slice_length1, | |
long prm_1_slice_stride1, | |
ulong prm_1_slice_length2, | |
long prm_1_slice_stride2, | |
global double * prm_2_expr_1, | |
ulong prm_2_slice_start, | |
ulong prm_2_slice_length0, | |
long prm_2_slice_stride0, | |
ulong prm_2_slice_length1, | |
long prm_2_slice_stride1, | |
ulong prm_2_slice_length2, | |
long prm_2_slice_stride2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1_expr_1[prm_1_slice_func(prm_1_slice_start, prm_1_slice_length0, prm_1_slice_stride0, prm_1_slice_length1, prm_1_slice_stride1, prm_1_slice_length2, prm_1_slice_stride2, idx)] = prm_2_expr_1[prm_2_slice_func(prm_2_slice_start, prm_2_slice_length0, prm_2_slice_stride0, prm_2_slice_length1, prm_2_slice_stride1, prm_2_slice_length2, prm_2_slice_stride2, idx)]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int SUM_int | |
( | |
int prm1, | |
int prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global int * prm_2_1, | |
ulong prm_2_start, | |
ulong prm_2_length0, | |
long prm_2_stride0, | |
ulong prm_2_length1, | |
long prm_2_stride1, | |
ulong prm_2_length2, | |
long prm_2_stride2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int prm_2_sum = (int)0; | |
{ | |
size_t pos = idx; | |
size_t ptr2 = prm_2_start + (pos % prm_2_length1) * prm_2_stride1; | |
pos /= prm_2_length1; | |
ptr2 += (pos % prm_2_length0) * prm_2_stride0; | |
for(size_t i2 = 0, ptr3 = ptr2; i2 < prm_2_length2; ++i2, ptr3 += prm_2_stride2) | |
{ | |
size_t idx = ptr3; | |
prm_2_sum = SUM_int(prm_2_sum, prm_2_1[idx]); | |
} | |
} | |
prm_1[idx] = prm_2_sum; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.17 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"multi_array" end time: Jan 30 11:27 IST | |
"multi_array" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
16/30 Testing: spmv | |
16/30 Test: spmv | |
Command: "/tmp/vexcl/build/tests/spmv" | |
Directory: /tmp/vexcl/build/tests | |
"spmv" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597471 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 12 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global ulong * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global ulong * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const ulong * row, | |
global const ulong * col, | |
global const double * val, | |
global const double * in, | |
global double * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] = scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const ulong * row, | |
global const ulong * col, | |
global const double * val, | |
global const double * in, | |
global double * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] += scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong MAX_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 > prm2 ? prm1 : prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = MAX_ulong(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_expr_1, | |
global int * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const uint * row, | |
global const int * col, | |
global const double * val, | |
global const double * in, | |
global double * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] = scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const uint * row, | |
global const int * col, | |
global const double * val, | |
global const double * in, | |
global double * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] += scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_spmv | |
( | |
global const ulong * idx, | |
global const ulong * row, | |
global const int * col, | |
global const double * val, | |
global const double * vec, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j) | |
{ | |
sum += val[j] * vec[i + col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global const ulong * prm_2_idx, | |
global const ulong * prm_2_row, | |
global const int * prm_2_col, | |
global const double * prm_2_val, | |
global const double * prm_2_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_3_spmv | |
( | |
global const ulong * idx, | |
global const ulong * row, | |
global const int * col, | |
global const double * val, | |
global const double * vec, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j) | |
{ | |
sum += val[j] * vec[i + col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global const ulong * prm_3_idx, | |
global const ulong * prm_3_row, | |
global const int * prm_3_col, | |
global const double * prm_3_val, | |
global const double * prm_3_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3_spmv(prm_3_idx, prm_3_row, prm_3_col, prm_3_val, prm_3_vec, idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_csr_spmv | |
( | |
global const ulong * row, | |
global const ulong * col, | |
global const double * val, | |
global const double * in, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global const ulong * prm_2_row, | |
global const ulong * prm_2_col, | |
global const double * prm_2_val, | |
global const double * prm_2_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = sin( prm_2_csr_spmv(prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_csr_spmv | |
( | |
global const ulong * row, | |
global const ulong * col, | |
global const double * val, | |
global const double * in, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global const ulong * prm_2_row, | |
global const ulong * prm_2_col, | |
global const double * prm_2_val, | |
global const double * prm_2_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = cos( prm_2_csr_spmv(prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_spmv | |
( | |
global const ulong * idx, | |
global const ulong * row, | |
global const int * col, | |
global const double * val, | |
global const double * vec, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j) | |
{ | |
sum += val[j] * vec[i + col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global const ulong * prm_2_idx, | |
global const ulong * prm_2_row, | |
global const int * prm_2_col, | |
global const double * prm_2_val, | |
global const double * prm_2_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_3_spmv | |
( | |
global const ulong * idx, | |
global const ulong * row, | |
global const int * col, | |
global const double * val, | |
global const double * vec, | |
ulong i | |
) | |
{ | |
double sum = 0; | |
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j) | |
{ | |
sum += val[j] * vec[i + col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global const ulong * prm_3_idx, | |
global const ulong * prm_3_row, | |
global const int * prm_3_col, | |
global const double * prm_3_val, | |
global const double * prm_3_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3_spmv(prm_3_idx, prm_3_row, prm_3_col, prm_3_val, prm_3_vec, idx) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global double2 * prm_2_expr_1, | |
global ulong * prm_2_slice_1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_expr_1[prm_2_slice_1[idx]]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const ulong * row, | |
global const ulong * col, | |
global const double2 * val, | |
global const double2 * in, | |
global double2 * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double2 sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] = scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void csr_spmv | |
( | |
ulong n, | |
double scale, | |
global const ulong * row, | |
global const ulong * col, | |
global const double2 * val, | |
global const double2 * in, | |
global double2 * out | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong i = chunk_start; i < chunk_end; ++i) | |
{ | |
double2 sum = 0; | |
for(size_t j = row[i], e = row[i + 1]; j < e; ++j) | |
{ | |
sum += val[j] * in[col[j]]; | |
} | |
out[i] += scale * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double2 prm_2_spmv | |
( | |
global const ulong * idx, | |
global const ulong * row, | |
global const int * col, | |
global const double2 * val, | |
global const double2 * vec, | |
ulong i | |
) | |
{ | |
double2 sum = 0; | |
for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j) | |
{ | |
sum += val[j] * vec[i + col[j]]; | |
} | |
return sum; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global const ulong * prm_2_idx, | |
global const ulong * prm_2_row, | |
global const int * prm_2_col, | |
global const double2 * prm_2_val, | |
global const double2 * prm_2_vec | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2_spmv(prm_2_idx, prm_2_row, prm_2_col, prm_2_val, prm_2_vec, idx); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.82 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"spmv" end time: Jan 30 11:27 IST | |
"spmv" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
17/30 Testing: stencil | |
17/30 Test: stencil | |
Command: "/tmp/vexcl/build/tests/stencil" | |
Directory: /tmp/vexcl/build/tests | |
"stencil" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597472 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 7 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double read_x | |
( | |
long g_id, | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * xloc, | |
global const double * xrem | |
) | |
{ | |
if (g_id >= 0 && g_id < n) | |
{ | |
return xloc[g_id]; | |
} | |
else if (g_id < 0) | |
{ | |
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0; | |
else return xloc[0]; | |
} | |
else | |
{ | |
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0; | |
else return xloc[n - 1]; | |
} | |
} | |
kernel void fast_conv | |
( | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * s, | |
global const double * xloc, | |
global const double * xrem, | |
global double * y, | |
double alpha, | |
double beta, | |
local double * smem | |
) | |
{ | |
local double * S = smem; | |
local double * X = smem + lhalo + rhalo + 1; | |
size_t grid_size = get_global_size(0); | |
int l_id = get_local_id(0); | |
int block_size = get_local_size(0); | |
for(int i = l_id; i < rhalo + lhalo + 1; i += block_size) S[i] = s[i]; | |
for(long g_id = get_global_id(0), pos = 0; pos < n; g_id += grid_size, pos += grid_size) | |
{ | |
for(int i = l_id, j = g_id - lhalo; i < block_size + lhalo + rhalo; i += block_size, j += block_size) | |
{ | |
X[i] = read_x(j, n, has_left, has_right, lhalo, rhalo, xloc, xrem); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) | |
{ | |
double sum = 0; | |
for(int j = -lhalo; j <= rhalo; j++) | |
{ | |
sum += S[lhalo + j] * X[lhalo + l_id + j]; | |
} | |
if (alpha) y[g_id] = alpha * y[g_id] + beta * sum; | |
else y[g_id] = beta * sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double read_x | |
( | |
long g_id, | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * xloc, | |
global const double * xrem | |
) | |
{ | |
if (g_id >= 0 && g_id < n) | |
{ | |
return xloc[g_id]; | |
} | |
else if (g_id < 0) | |
{ | |
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0; | |
else return xloc[0]; | |
} | |
else | |
{ | |
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0; | |
else return xloc[n - 1]; | |
} | |
} | |
kernel void slow_conv | |
( | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * s, | |
global const double * xloc, | |
global const double * xrem, | |
global double * y, | |
double alpha, | |
double beta | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double sum = 0; | |
for(int j = -lhalo; j <= rhalo; j++) | |
{ | |
sum += s[lhalo + j] * read_x((long)idx + j, n, has_left, has_right, lhalo, rhalo, xloc, xrem); | |
} | |
if (alpha) y[idx] = alpha * y[idx] + beta * sum; | |
else y[idx] = beta * sum; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double read_x | |
( | |
long g_id, | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * xloc, | |
global const double * xrem | |
) | |
{ | |
if (g_id >= 0 && g_id < n) | |
{ | |
return xloc[g_id]; | |
} | |
else if (g_id < 0) | |
{ | |
if (has_left) return (lhalo + g_id >= 0) ? xrem[lhalo + g_id] : 0; | |
else return xloc[0]; | |
} | |
else | |
{ | |
if (has_right) return (g_id < n + rhalo) ? xrem[lhalo + g_id - n] : 0; | |
else return xloc[n - 1]; | |
} | |
} | |
double stencil_oper | |
( | |
local const double * X | |
) | |
{ | |
return sin(X[1] - X[0]) + sin(X[0] - X[-1]); | |
} | |
kernel void convolve | |
( | |
ulong n, | |
char has_left, | |
char has_right, | |
int lhalo, | |
int rhalo, | |
global const double * xloc, | |
global const double * xrem, | |
global double * y, | |
double alpha, | |
double beta, | |
local double * smem | |
) | |
{ | |
local double * X = smem; | |
size_t grid_size = get_global_size(0); | |
int l_id = get_local_id(0); | |
int block_size = get_local_size(0); | |
for(long g_id = get_global_id(0), pos = 0; pos < n; g_id += grid_size, pos += grid_size) | |
{ | |
for(int i = l_id, j = g_id - lhalo; i < block_size + lhalo + rhalo; i += block_size, j += block_size) | |
{ | |
X[i] = read_x(j, n, has_left, has_right, lhalo, rhalo, xloc, xrem); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) | |
{ | |
double sum = stencil_oper(X + lhalo + l_id); | |
if (alpha) y[g_id] = alpha * y[g_id] + beta * sum; | |
else y[g_id] = beta * sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.24 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"stencil" end time: Jan 30 11:27 IST | |
"stencil" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
18/30 Testing: generator | |
18/30 Test: generator | |
Command: "/tmp/vexcl/build/tests/generator" | |
Directory: /tmp/vexcl/build/tests | |
"generator" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597472 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 7 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void rk2_stepper | |
( | |
ulong n, | |
global double * p_var1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ double var1 = p_var1[idx]; | |
double var2 = sin( var1 ); | |
double var3 = ( 1.000000000000e-02 * var2 ); | |
double var4 = ( var1 + ( 5.000000000000e-01 * var3 ) ); | |
double var5 = sin( var4 ); | |
double var6 = ( 1.000000000000e-02 * var5 ); | |
var1 = ( var1 + var6 ); | |
p_var1[idx] = var1; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void rk2_stepper | |
( | |
ulong n, | |
global double * p_var1 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ double var1 = p_var1[idx]; | |
double var2 = sin( var1 ); | |
double var3 = ( 1.000000000000e-02 * var2 ); | |
double var4 = ( var1 + ( 5.000000000000e-01 * var3 ) ); | |
double var5 = sin( var4 ); | |
double var6 = ( 1.000000000000e-02 * var5 ); | |
var1 = ( var1 + var6 ); | |
p_var1[idx] = var1; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double sin2 | |
( | |
double x | |
) | |
{ | |
double s = sin(x); return s * s; | |
} | |
kernel void test_sin2 | |
( | |
ulong n, | |
const global double * p_var7, | |
global double * p_var8 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ double var7 = p_var7[idx]; | |
double var8 = p_var8[idx]; | |
var8 = sin2( var7 ); | |
p_var8[idx] = var8; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double sin2 | |
( | |
double x | |
) | |
{ | |
double s = sin(x); return s * s; | |
} | |
kernel void test_sin2 | |
( | |
ulong n, | |
const global double * p_var7, | |
global double * p_var8 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ double var7 = p_var7[idx]; | |
double var8 = p_var8[idx]; | |
var8 = sin2( var7 ); | |
p_var8[idx] = var8; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double rk2 | |
( | |
double prm1 | |
) | |
{ | |
double var9 = prm1; | |
double var10 = sin( var9 ); | |
double var11 = ( 1.000000000000e-02 * var10 ); | |
double var12 = ( var9 + ( 5.000000000000e-01 * var11 ) ); | |
double var13 = sin( var12 ); | |
double var14 = ( 1.000000000000e-02 * var13 ); | |
var9 = ( var9 + var14 ); | |
return var9; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = rk2( prm_2[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double generated_function_1 | |
( | |
double prm1 | |
) | |
{ | |
double var15 = prm1; | |
double var16 = sin( var15 ); | |
double var17 = ( 1.000000000000e-02 * var16 ); | |
double var18 = ( var15 + ( 5.000000000000e-01 * var17 ) ); | |
double var19 = sin( var18 ); | |
double var20 = ( 1.000000000000e-02 * var19 ); | |
var15 = ( var15 + var20 ); | |
return var15; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = generated_function_1( prm_2[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double generated_function_1 | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
double var21 = prm1; | |
double var22 = prm2; | |
double var23 = ( ( var21 * var21 ) + ( var22 * var22 ) ); | |
return var23; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2, | |
global double * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = generated_function_1( prm_2[idx], prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_tag_1_1, | |
double prm_tag_2_1, | |
double prm_5 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_tag_1_1[idx] + ( prm_tag_2_1 * sin( ( prm_tag_1_1[idx] + ( prm_5 * ( prm_tag_2_1 * sin( prm_tag_1_1[idx] ) ) ) ) ) ) ); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.33 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"generator" end time: Jan 30 11:27 IST | |
"generator" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
19/30 Testing: mba | |
19/30 Test: mba | |
Command: "/tmp/vexcl/build/tests/mba" | |
Directory: /tmp/vexcl/build/tests | |
"mba" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597472 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 2 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2, | |
ulong prm_3, | |
double prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( ( prm_2 * (prm_3 + idx) ) / prm_4 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_B0 | |
( | |
double t | |
) | |
{ | |
return (t * (t * (-t + 3) - 3) + 1) / 6; | |
} | |
double prm_2_B1 | |
( | |
double t | |
) | |
{ | |
return (t * t * (3 * t - 6) + 4) / 6; | |
} | |
double prm_2_B2 | |
( | |
double t | |
) | |
{ | |
return (t * (t * (-3 * t + 3) + 3) + 1) / 6; | |
} | |
double prm_2_B3 | |
( | |
double t | |
) | |
{ | |
return t * t * t / 6; | |
} | |
double prm_2_mba | |
( | |
double x0, | |
double x1, | |
double c0, | |
double h0, | |
ulong n0, | |
ulong m0, | |
double c1, | |
double h1, | |
ulong n1, | |
ulong m1, | |
global const double * phi | |
) | |
{ | |
double u; | |
u = (x0 - c0) * h0; | |
ulong i0 = floor(u) - 1; | |
double s0 = u - floor(u); | |
u = (x1 - c1) * h1; | |
ulong i1 = floor(u) - 1; | |
double s1 = u - floor(u); | |
double f = 0; | |
ulong j, idx; | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
return f; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * prm_2_x0_1, | |
double prm_2_x1_1, | |
ulong prm_2_x1_2, | |
double prm_2_x1_3, | |
double prm_2_c0, | |
double prm_2_h0, | |
ulong prm_2_n0, | |
ulong prm_2_m0, | |
double prm_2_c1, | |
double prm_2_h1, | |
ulong prm_2_n1, | |
ulong prm_2_m1, | |
global const double * prm_2_phi | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = sin( prm_2_mba(prm_2_x0_1[idx], ( ( prm_2_x1_1 * (prm_2_x1_2 + idx) ) / prm_2_x1_3 ), prm_2_c0, prm_2_h0, prm_2_n0, prm_2_m0, prm_2_c1, prm_2_h1, prm_2_n1, prm_2_m1, prm_2_phi) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double prm_2_B0 | |
( | |
double t | |
) | |
{ | |
return (t * (t * (-t + 3) - 3) + 1) / 6; | |
} | |
double prm_2_B1 | |
( | |
double t | |
) | |
{ | |
return (t * t * (3 * t - 6) + 4) / 6; | |
} | |
double prm_2_B2 | |
( | |
double t | |
) | |
{ | |
return (t * (t * (-3 * t + 3) + 3) + 1) / 6; | |
} | |
double prm_2_B3 | |
( | |
double t | |
) | |
{ | |
return t * t * t / 6; | |
} | |
double prm_2_mba | |
( | |
double x0, | |
double x1, | |
double c0, | |
double h0, | |
ulong n0, | |
ulong m0, | |
double c1, | |
double h1, | |
ulong n1, | |
ulong m1, | |
global const double * phi | |
) | |
{ | |
double u; | |
u = (x0 - c0) * h0; | |
ulong i0 = floor(u) - 1; | |
double s0 = u - floor(u); | |
u = (x1 - c1) * h1; | |
ulong i1 = floor(u) - 1; | |
double s1 = u - floor(u); | |
double f = 0; | |
ulong j, idx; | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 0; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B0(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 1; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B1(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 2; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B2(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 0; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B0(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 1; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B1(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 2; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B2(s1) * phi[idx]; | |
} | |
} | |
idx = 0; | |
j = i0 + 3; | |
if (j < n0) | |
{ | |
idx += j * m0; | |
j = i1 + 3; | |
if (j < n1) | |
{ | |
idx += j * m1; | |
f += prm_2_B3(s0) * prm_2_B3(s1) * phi[idx]; | |
} | |
} | |
return f; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2_x0_1_1, | |
ulong prm_2_x0_1_2, | |
double prm_2_x0_1_3, | |
double prm_2_c0, | |
double prm_2_h0, | |
ulong prm_2_n0, | |
ulong prm_2_m0, | |
double prm_2_c1, | |
double prm_2_h1, | |
ulong prm_2_n1, | |
ulong prm_2_m1, | |
global const double * prm_2_phi | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = ( ( prm_2_x0_1_1 * (prm_2_x0_1_2 + idx) ) / prm_2_x0_1_3 ); | |
prm_1[idx] = prm_2_mba(temp_1, temp_1, prm_2_c0, prm_2_h0, prm_2_n0, prm_2_m0, prm_2_c1, prm_2_h1, prm_2_n1, prm_2_m1, prm_2_phi); | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.31 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"mba" end time: Jan 30 11:27 IST | |
"mba" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
20/30 Testing: random | |
20/30 Test: random | |
Command: "/tmp/vexcl/build/tests/random" | |
Directory: /tmp/vexcl/build/tests | |
"random" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597473 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_2_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[2]; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
} | |
int random_int_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
int res; | |
} ctr; | |
uint key[1]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
philox_uint_2_10(ctr.ctr, key); | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global uint * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_int_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_4_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[4]; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
} | |
float4 random_float4_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[4]; | |
uint res_i[4]; | |
float res_f[4]; | |
float4 res; | |
} ctr; | |
uint key[2]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2; | |
key[0] = 0x12345678; | |
key[1] = 0x12345678; | |
philox_uint_4_10(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 4294967295.0f; | |
ctr.res_f[1] = ctr.res_i[1] / 4294967295.0f; | |
ctr.res_f[2] = ctr.res_i[2] / 4294967295.0f; | |
ctr.res_f[3] = ctr.res_i[3] / 4294967295.0f; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float4 * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_float4_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_ulong_4_10 | |
( | |
ulong * ctr, | |
ulong * key | |
) | |
{ | |
ulong m[4]; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B97F4A7C15; | |
key[1] += 0xBB67AE8584CAA73B; | |
m[0] = mul_hi(0xD2E7470EE14C6C93, ctr[0]); | |
m[1] = 0xD2E7470EE14C6C93 * ctr[0]; | |
m[2] = mul_hi(0xCA5A826395121157, ctr[2]); | |
m[3] = 0xCA5A826395121157 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
} | |
double4 random_double4_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
ulong ctr[4]; | |
ulong res_i[4]; | |
double res_f[4]; | |
double4 res; | |
} ctr; | |
ulong key[2]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2; | |
key[0] = 0x12345678; | |
key[1] = 0x12345678; | |
philox_ulong_4_10(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; | |
ctr.res_f[1] = ctr.res_i[1] / 18446744073709551615.0; | |
ctr.res_f[2] = ctr.res_i[2] / 18446744073709551615.0; | |
ctr.res_f[3] = ctr.res_i[3] / 18446744073709551615.0; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double4 * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_double4_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_2_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[2]; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
} | |
double random_double_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
ulong res_i[1]; | |
double res_f[1]; | |
double res; | |
} ctr; | |
uint key[1]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
philox_uint_2_10(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_double_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_ulong(mySum, ( prm_1[idx] > prm_2 )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
int prm_2, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_ulong(mySum, ( prm_1[idx] < prm_2 )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_4_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[4]; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
key[0] += 0x9E3779B9; | |
key[1] += 0xBB67AE85; | |
m[0] = mul_hi(0xD2511F53, ctr[0]); | |
m[1] = 0xD2511F53 * ctr[0]; | |
m[2] = mul_hi(0xCD9E8D57, ctr[2]); | |
m[3] = 0xCD9E8D57 * ctr[2]; | |
ctr[0] = m[2] ^ ctr[1] ^ key[0]; | |
ctr[1] = m[3]; | |
ctr[2] = m[0] ^ ctr[3] ^ key[1]; | |
ctr[3] = m[1]; | |
} | |
double random_normal_double_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[4]; | |
ulong res_i[2]; | |
} ctr; | |
double u[2]; | |
uint key[2]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2; | |
key[0] = 0x12345678; | |
key[1] = 0x12345678; | |
philox_uint_4_10(ctr.ctr, key); | |
u[0] = ctr.res_i[0] / 18446744073709551615.0; | |
u[1] = ctr.res_i[1] / 18446744073709551615.0; | |
return sqrt(-2 * log(u[0])) * cospi(2 * u[1]); | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_normal_double_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, fabs( prm_1[idx] )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void threefry_uint_2_20 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
const uint p = 0x1BD11BDA ^ key[0] ^ key[1]; | |
ctr[0] += key[0]; | |
ctr[1] += key[1]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[1]; | |
ctr[1] += p; ctr[1] += 1; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 17u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 29u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 16u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 24u); ctr[1] ^= ctr[0]; | |
ctr[0] += p; | |
ctr[1] += key[0]; ctr[1] += 2; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[0]; | |
ctr[1] += key[1]; ctr[1] += 3; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 17u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 29u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 16u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 24u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[1]; | |
ctr[1] += p; ctr[1] += 4; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += p; | |
ctr[1] += key[0]; ctr[1] += 5; | |
} | |
double random_double_threefry | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
ulong res_i[1]; | |
double res_f[1]; | |
double res; | |
} ctr; | |
uint key[2]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
key[1] = 0x12345678; | |
threefry_uint_2_20(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_double_threefry( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
ulong SUM_ulong | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
void threefry_uint_2_20 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
const uint p = 0x1BD11BDA ^ key[0] ^ key[1]; | |
ctr[0] += key[0]; | |
ctr[1] += key[1]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[1]; | |
ctr[1] += p; ctr[1] += 1; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 17u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 29u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 16u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 24u); ctr[1] ^= ctr[0]; | |
ctr[0] += p; | |
ctr[1] += key[0]; ctr[1] += 2; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[0]; | |
ctr[1] += key[1]; ctr[1] += 3; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 17u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 29u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 16u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 24u); ctr[1] ^= ctr[0]; | |
ctr[0] += key[1]; | |
ctr[1] += p; ctr[1] += 4; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 15u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 26u); ctr[1] ^= ctr[0]; | |
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0]; | |
ctr[0] += p; | |
ctr[1] += key[0]; ctr[1] += 5; | |
} | |
double random_double_threefry | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
ulong res_i[1]; | |
double res_f[1]; | |
double res; | |
} ctr; | |
uint key[2]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
key[1] = 0x12345678; | |
threefry_uint_2_20(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0; | |
return ctr.res; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
ulong prm_tag_0_1, | |
int prm_1_2, | |
int prm_3_2, | |
int prm_5, | |
global ulong * g_odata | |
) | |
{ | |
ulong mySum = (ulong)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
double temp_1 = random_double_threefry( (prm_tag_0_1 + idx), prm_1_2 ); | |
double temp_2 = random_double_threefry( (prm_tag_0_1 + idx), prm_3_2 ); | |
mySum = SUM_ulong(mySum, ( ( ( temp_1 * temp_1 ) + ( temp_2 * temp_2 ) ) < prm_5 )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.73 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"random" end time: Jan 30 11:27 IST | |
"random" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
21/30 Testing: sort | |
21/30 Test: sort | |
Command: "/tmp/vexcl/build/tests/sort" | |
Directory: /tmp/vexcl/build/tests | |
"sort" start time: Jan 30 11:27 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597473 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 6 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
float x, | |
float y | |
) | |
{ | |
return x < y; | |
} | |
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_float | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const float * keys_shared0, | |
float * results0 | |
) | |
{ | |
float a_key0 = keys_shared0[a_begin]; | |
float b_key0 = keys_shared0[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[0] = p ? a_key0 : b_key0; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[1] = p ? a_key0 : b_key0; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[2] = p ? a_key0 : b_key0; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[3] = p ? a_key0 : b_key0; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[4] = p ? a_key0 : b_key0; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[5] = p ? a_key0 : b_key0; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[6] = p ? a_key0 : b_key0; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[7] = p ? a_key0 : b_key0; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[8] = p ? a_key0 : b_key0; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[9] = p ? a_key0 : b_key0; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[10] = p ? a_key0 : b_key0; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void swap_float | |
( | |
float * a0, | |
float * b0 | |
) | |
{ | |
{ | |
float c = *a0; | |
*a0 = *b0; | |
*b0 = c; | |
} | |
} | |
void odd_even_transpose_sort_11_float | |
( | |
float * keys0 | |
) | |
{ | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_float(keys0 + 1, keys0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_float(keys0 + 3, keys0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_float(keys0 + 5, keys0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_float(keys0 + 7, keys0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_float(keys0 + 9, keys0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_float(keys0 + 1, keys0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_float(keys0 + 3, keys0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_float(keys0 + 5, keys0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_float(keys0 + 7, keys0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_float(keys0 + 9, keys0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_float(keys0 + 1, keys0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_float(keys0 + 3, keys0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_float(keys0 + 5, keys0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_float(keys0 + 7, keys0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_float(keys0 + 9, keys0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_float(keys0 + 1, keys0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_float(keys0 + 3, keys0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_float(keys0 + 5, keys0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_float(keys0 + 7, keys0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_float(keys0 + 9, keys0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_float(keys0 + 1, keys0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_float(keys0 + 3, keys0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_float(keys0 + 5, keys0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_float(keys0 + 7, keys0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_float(keys0 + 9, keys0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_float(keys0 + 0, keys0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_float(keys0 + 2, keys0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_float(keys0 + 4, keys0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_float(keys0 + 6, keys0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_float(keys0 + 8, keys0 + 9); | |
} | |
} | |
int merge_path_float | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const float * a0, | |
local const float * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void block_sort_pass_1_11_float | |
( | |
int tid, | |
int count, | |
int coop, | |
int * indices, | |
local const float * keys_shared0, | |
float * keys0 | |
) | |
{ | |
int list = ~(coop - 1) & tid; | |
int diag = min(count, 11 * ((coop - 1) & tid)); | |
int start = 11 * list; | |
int a0 = min(count, start); | |
int b0 = min(count, start + 11 * (coop / 2)); | |
int b1 = min(count, start + 11 * coop); | |
int p = merge_path_float(b0 - a0, b1 - b0, diag, keys_shared0 + a0, keys_shared0 + b0); | |
serial_merge_11_float(a0 + p, b0, b0 + diag - p, b1, indices, keys_shared0, keys0); | |
} | |
void block_sort_loop_1_11_float | |
( | |
int tid, | |
int count, | |
local float * keys_shared0 | |
) | |
{ | |
int indices[11]; | |
float keys0[11]; | |
} | |
void mergesort_1_11_float | |
( | |
int count, | |
int tid, | |
float * thread_keys0, | |
local float * keys_shared0 | |
) | |
{ | |
if(11 * tid < count) odd_even_transpose_sort_11_float(thread_keys0); | |
thread_to_shared_11_float(thread_keys0, tid, keys_shared0); | |
block_sort_loop_1_11_float(tid, count, keys_shared0); | |
} | |
kernel void block_sort | |
( | |
int count, | |
global const float * keys_src0, | |
global float * keys_dst0 | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
float keys0[12]; | |
}; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int gid = 11 * block; | |
int count2 = min(11, count - gid); | |
float thread_keys0[11]; | |
global_to_shared_1_11_float(count2, keys_src0 + gid, tid, shared.keys0); | |
shared_to_thread_11_float(shared.keys0, tid, thread_keys0); | |
int first = 11 * tid; | |
if(first + 11 > count2 && first < count2) | |
{ | |
float max_key0 = thread_keys0[0]; | |
if(first + 1 < count2 && comp(max_key0, thread_keys0[1]) ) | |
{ | |
max_key0 = thread_keys0[1]; | |
} | |
if(first + 2 < count2 && comp(max_key0, thread_keys0[2]) ) | |
{ | |
max_key0 = thread_keys0[2]; | |
} | |
if(first + 3 < count2 && comp(max_key0, thread_keys0[3]) ) | |
{ | |
max_key0 = thread_keys0[3]; | |
} | |
if(first + 4 < count2 && comp(max_key0, thread_keys0[4]) ) | |
{ | |
max_key0 = thread_keys0[4]; | |
} | |
if(first + 5 < count2 && comp(max_key0, thread_keys0[5]) ) | |
{ | |
max_key0 = thread_keys0[5]; | |
} | |
if(first + 6 < count2 && comp(max_key0, thread_keys0[6]) ) | |
{ | |
max_key0 = thread_keys0[6]; | |
} | |
if(first + 7 < count2 && comp(max_key0, thread_keys0[7]) ) | |
{ | |
max_key0 = thread_keys0[7]; | |
} | |
if(first + 8 < count2 && comp(max_key0, thread_keys0[8]) ) | |
{ | |
max_key0 = thread_keys0[8]; | |
} | |
if(first + 9 < count2 && comp(max_key0, thread_keys0[9]) ) | |
{ | |
max_key0 = thread_keys0[9]; | |
} | |
if(first + 10 < count2 && comp(max_key0, thread_keys0[10]) ) | |
{ | |
max_key0 = thread_keys0[10]; | |
} | |
if(first + 0 >= count2) | |
{ | |
thread_keys0[0] = max_key0; | |
} | |
if(first + 1 >= count2) | |
{ | |
thread_keys0[1] = max_key0; | |
} | |
if(first + 2 >= count2) | |
{ | |
thread_keys0[2] = max_key0; | |
} | |
if(first + 3 >= count2) | |
{ | |
thread_keys0[3] = max_key0; | |
} | |
if(first + 4 >= count2) | |
{ | |
thread_keys0[4] = max_key0; | |
} | |
if(first + 5 >= count2) | |
{ | |
thread_keys0[5] = max_key0; | |
} | |
if(first + 6 >= count2) | |
{ | |
thread_keys0[6] = max_key0; | |
} | |
if(first + 7 >= count2) | |
{ | |
thread_keys0[7] = max_key0; | |
} | |
if(first + 8 >= count2) | |
{ | |
thread_keys0[8] = max_key0; | |
} | |
if(first + 9 >= count2) | |
{ | |
thread_keys0[9] = max_key0; | |
} | |
if(first + 10 >= count2) | |
{ | |
thread_keys0[10] = max_key0; | |
} | |
} | |
mergesort_1_11_float(count2, tid, thread_keys0, shared.keys0); | |
shared_to_global_1_11_float(count2, shared.keys0, tid, keys_dst0 + gid); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
float x, | |
float y | |
) | |
{ | |
return x < y; | |
} | |
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
int4 find_mergesort_interval | |
( | |
int4 frame, | |
int coop, | |
int block, | |
int nv, | |
int count, | |
int mp0, | |
int mp1 | |
) | |
{ | |
int diag = nv * block - frame.x; | |
int4 interval; | |
interval.x = frame.x + mp0; | |
interval.y = min(count, frame.x + mp1); | |
interval.z = min(count, frame.y + diag - mp0); | |
interval.w = min(count, frame.y + diag + nv - mp1); | |
if(coop - 1 == ((coop - 1) & block)) | |
{ | |
interval.y = min(count, frame.x + frame.z); | |
interval.w = min(count, frame.y + frame.z); | |
} | |
return interval; | |
} | |
int4 compute_merge_range | |
( | |
int a_count, | |
int b_count, | |
int block, | |
int coop, | |
int nv, | |
global const int * mp_global | |
) | |
{ | |
int mp0 = mp_global[block]; | |
int mp1 = mp_global[block + 1]; | |
int gid = nv * block; | |
int4 range; | |
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, block, nv); | |
range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1); | |
} | |
else | |
{ | |
range.x = mp0; | |
range.y = mp1; | |
range.z = gid - range.x; | |
range.w = min(a_count + b_count, gid + nv) - range.y; | |
} | |
return range; | |
} | |
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float(int count, local const float * source, int tid, global float * dest)
{
    // Copy up to 11 elements from local to global memory; each element is
    // guarded so only indices inside [0, count) are written.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_float(local const float * data, int tid, float * reg)
{
    // Gather this thread's contiguous 11-element slice (base 11 * tid)
    // from local memory into registers, then synchronize the group.
    local const float * base = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
        reg[i] = base[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_float(const float * reg, int tid, local float * dest)
{
    // Scatter this thread's 11 register values into its contiguous slice
    // of local memory (base 11 * tid), then synchronize the group.
    local float * base = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
        base[i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_regstr_pred_1_11_int(int count, global const int * data, int tid, int * reg)
{
    // Predicated register load: element i is read from data[i + tid] only
    // while that index lies inside [0, count); other slots are left as-is.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_int(int count, global const int * data, int tid, int * reg)
{
    // Full-tile fast path: when at least 11 elements are available no
    // bounds checks are needed; otherwise defer to the predicated loader.
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i)
            reg[i] = data[i + tid];
    }
    else
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
    }
}
void regstr_to_global_1_11_int(int count, const int * reg, int tid, global int * dest)
{
    // Guarded store of the 11 register values to global memory; only
    // indices inside [0, count) are written.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_int(local const int * data, int tid, int * reg)
{
    // Load 11 elements (data[i + tid]) from local memory into registers,
    // then synchronize the group.
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_int(const int * reg, int tid, local int * dest)
{
    // Write the 11 register values back to local memory at the mirrored
    // offsets (dest[i + tid]), then synchronize the group.
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_int(int count, global const int * source, int tid, local int * dest)
{
    // Stage a bounded tile from global to local memory via registers.
    int staging[11];
    global_to_regstr_1_11_int(count, source, tid, staging);
    regstr_to_shared_1_11_int(staging, tid, dest);
}
void shared_to_global_1_11_int(int count, local const int * source, int tid, global int * dest)
{
    // Copy up to 11 elements from local to global memory, bounds-checked
    // against count per element.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_int(local const int * data, int tid, int * reg)
{
    // Gather this thread's contiguous 11-element slice (base 11 * tid)
    // from local memory into registers, then synchronize the group.
    local const int * base = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
        reg[i] = base[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_int(const int * reg, int tid, local int * dest)
{
    // Scatter this thread's 11 register values into its contiguous slice
    // of local memory (base 11 * tid), then synchronize the group.
    local int * base = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
        base[i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void serial_merge_11_float
(
int a_begin,
int a_end,
int b_begin,
int b_end,
int * indices,
local const float * keys_shared0,
float * results0
)
{
    // Sequentially merge exactly 11 keys from the two sorted local-memory
    // ranges [a_begin, a_end) and [b_begin, b_end) of keys_shared0.  For
    // each output slot it records both the merged key (results0) and the
    // local-memory index it came from (indices).  The consumed side's next
    // key is re-read after each step, so the source ranges are expected to
    // tolerate a one-past read (padded tile) — as in the original
    // generated unrolled code this replaces.
    float a_key0 = keys_shared0[a_begin];
    float b_key0 = keys_shared0[b_begin];
    for (int i = 0; i < 11; ++i)
    {
        // Take from A when B is exhausted, or when A still has keys and
        // A's key does not compare after B's key (keeps the merge stable).
        bool p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0));
        results0[i] = p ? a_key0 : b_key0;
        indices[i] = p ? a_begin : b_begin;
        if (p)
            a_key0 = keys_shared0[++a_begin];
        else
            b_key0 = keys_shared0[++b_begin];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
int merge_path_float
(
int a_count,
int b_count,
int diag,
local const float * a0,
local const float * b0
)
{
    // Binary search along cross-diagonal `diag` of the merge grid: returns
    // how many of the first `diag` merged outputs are drawn from a0.
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);
    while (lo < hi)
    {
        int mid = (lo + hi) >> 1;
        if (!comp(b0[diag - 1 - mid], a0[mid]))
            lo = mid + 1;
        else
            hi = mid;
    }
    return lo;
}
void load2_to_regstr_1_11_11_float
(
global const float * a_global,
int a_count,
global const float * b_global,
int b_count,
int tid,
float * reg
)
{
    // Load 11 elements of the logical concatenation A ++ B into registers.
    // Shifting b_global down by a_count lets one index address both arrays:
    // index i reads a_global[i] for i < a_count and b_global[i] otherwise.
    b_global -= a_count;
    int total = a_count + b_count;
    if (total >= 11)
    {
        // Full tile: every index is in range, no upper-bound check needed.
        for (int i = 0; i < 11; ++i)
        {
            int idx = i + tid;
            reg[i] = (idx < a_count) ? a_global[idx] : b_global[idx];
        }
    }
    else
    {
        // Partial tile: also guard against reading past the end of B.
        for (int i = 0; i < 11; ++i)
        {
            int idx = i + tid;
            if (idx < a_count) reg[i] = a_global[idx];
            else if (idx < total) reg[i] = b_global[idx];
        }
    }
}
void load2_to_shared_1_11_11_float
(
global const float * a_global,
int a_count,
global const float * b_global,
int b_count,
int tid,
local float * shared
)
{
    // Gather the concatenation of A and B into registers, then publish the
    // registers to local memory.
    float staged[11];
    load2_to_regstr_1_11_11_float(a_global, a_count, b_global, b_count, tid, staged);
    regstr_to_shared_1_11_float(staged, tid, shared);
}
void merge_keys_indices_1_11_float
(
int a_count,
int b_count,
int4 range,
int tid,
int * indices,
global const float * a_global0,
global const float * b_global0,
local float * keys_shared0,
float * results0
)
{
// Cooperative tile merge: load this block's slices of A and B into local
// memory, split the work per thread with a merge-path search, and serially
// merge 11 keys per thread into results0/indices.
// range = (a0, a1, b0, b1): this block's half-open sub-ranges of A and B.
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
// The caller's counts are overwritten with the sizes of the sub-ranges.
a_count = a1 - a0;
b_count = b1 - b0;
// A occupies keys_shared0[0, a_count); B follows starting at a_count.
load2_to_shared_1_11_11_float(a_global0 + a0, a_count, b_global0 + b0, b_count, tid, keys_shared0);
// diag is this thread's first output position within the tile; mp is how
// many of those first diag outputs come from A.
int diag = 11 * tid;
int mp = merge_path_float(a_count, b_count, diag, keys_shared0, keys_shared0 + a_count);
// Per-thread merge ranges: A in [mp, a_count), B in [a_count + diag - mp,
// a_count + b_count) — both expressed as indices into keys_shared0.
int a0tid = mp;
int a1tid = a_count;
int b0tid = a_count + diag - mp;
int b1tid = a_count + b_count;
serial_merge_11_float(a0tid, a1tid, b0tid, b1tid, indices, keys_shared0, results0);
}
void device_merge_1_11_float
(
int a_count,
int b_count,
global const float * a_keys_global0,
global const float * b_keys_global0,
global float * keys_global0,
local float * keys_shared0,
int tid,
int block,
int4 range,
local int * indices_shared
)
{
// Merge this block's tile: compute the merged keys into registers, stage
// them through local memory to restore tile order, then write the tile to
// its slot in the output (offset 11 * block).
float results0[11];
int indices[11];
merge_keys_indices_1_11_float(a_count, b_count, range, tid, indices, a_keys_global0, b_keys_global0, keys_shared0, results0);
// keys_shared0 is reused here: the per-thread results are scattered back
// so the subsequent global store can proceed in tile order.
thread_to_shared_11_float(results0, tid, keys_shared0);
// Recompute the actual element count of this tile from the range.
a_count = range.y - range.x;
b_count = range.w - range.z;
shared_to_global_1_11_float(a_count + b_count, keys_shared0, tid, keys_global0 + 11 * block);
}
kernel void merge
(
int a_count,
int b_count,
global const float * a_keys_global0,
global const float * b_keys_global0,
global float * keys_global0,
global const int * mp_global,
int coop
)
{
// Merge kernel: each work-group merges one 11-element tile of the two
// sorted float key streams into keys_global0, using the merge-path
// partition points precomputed in mp_global by merge_partition.
// Local storage is a union: 12 floats (11 keys + 1 extra slot) for the
// key-staging phase, overlapped with 11 ints for the index phase.
union Shared
{
struct
{
float keys0[12];
};
int indices[11];
};
local union Shared shared;
int tid = get_local_id(0);
int block = get_group_id(0);
// compute_merge_range is defined elsewhere in this program; presumably it
// converts mp_global entries into this block's (a0,a1,b0,b1) sub-ranges.
int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global);
device_merge_1_11_float(a_count, b_count, a_keys_global0, b_keys_global0, keys_global0, shared.keys0, tid, block, range, shared.indices);
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
float x,
float y
)
{
    // Strict "less than" predicate that defines the ascending sort/merge
    // order used throughout this program.
    bool less = x < y;
    return less;
}
int merge_path_float
(
int a_count,
int b_count,
int diag,
global const float * a0,
global const float * b0
)
{
    // Global-memory variant of the merge-path search: binary search along
    // cross-diagonal `diag`, returning how many of the first `diag` merged
    // outputs are taken from a0.
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);
    while (lo < hi)
    {
        int mid = (lo + hi) >> 1;
        if (!comp(b0[diag - 1 - mid], a0[mid]))
            lo = mid + 1;
        else
            hi = mid;
    }
    return lo;
}
int4 find_mergesort_frame
(
int coop,
int block,
int nv
)
{
    // Locate the mergesort frame that `block` belongs to when `coop`
    // blocks cooperate on one merge (coop is a power of two — the mask
    // arithmetic below relies on that; confirm against the host caller).
    // Returns (x, y, z, w) = (begin of first half, begin of second half,
    // length of each half, 0).
    int start = ~(coop - 1) & block; // first block of this coop group
    int size = nv * (coop >> 1);     // elements per sorted half
    int4 frame;
    frame.x = nv * start;
    frame.y = nv * start + size;
    frame.z = size;
    // BUG FIX: frame.w was previously left uninitialized, so the kernel
    // returned a partially-initialized vector. No current caller reads .w,
    // but zeroing it makes the value fully defined and safe to copy.
    frame.w = 0;
    return frame;
}
kernel void merge_partition
(
int a_count,
int b_count,
int nv,
int coop,
global int * mp_global,
int num_searches,
global const float * a_global0,
global const float * b_global0
)
{
// One work-item per partition: compute the merge-path split point (number
// of elements taken from A) at global diagonal nv * partition, and store
// it in mp_global for the merge kernel to consume.
int partition = get_global_id(0);
if (partition < num_searches)
{
int a0 = 0, b0 = 0;
int gid = nv * partition;
// coop != 0 marks a mergesort pass: both "sequences" are halves of the
// same array, so derive this partition's frame and clamp the counts to
// the data that actually exists.
if(coop)
{
int4 frame = find_mergesort_frame(coop, partition, nv);
a0 = frame.x;
b0 = min(a_count, frame.y);
b_count = min(a_count, frame.y + frame.z) - b0;
a_count = min(a_count, frame.x + frame.z) - a0;
// Make gid relative to the start of this frame.
gid -= a0;
}
// Clamp the diagonal to the total element count of this (sub-)merge.
int mp = merge_path_float(a_count, b_count, min(gid, a_count + b_count), a_global0 + a0, b_global0 + b0);
mp_global[partition] = mp;
}
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
int x,
int y
)
{
    // Strict "less than" predicate defining the ascending order for the
    // integer-key program.
    bool less = x < y;
    return less;
}
void global_to_regstr_pred_1_11_float(int count, global const float * data, int tid, float * reg)
{
    // Predicated register load: element i is read from data[i + tid] only
    // while that index lies inside [0, count); other slots are left as-is.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_float(int count, global const float * data, int tid, float * reg)
{
    // Full-tile fast path: at least 11 elements means no bounds checks;
    // otherwise defer to the predicated loader.
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i)
            reg[i] = data[i + tid];
    }
    else
    {
        global_to_regstr_pred_1_11_float(count, data, tid, reg);
    }
}
void regstr_to_global_1_11_float(int count, const float * reg, int tid, global float * dest)
{
    // Guarded store of the 11 register values to global memory; only
    // indices inside [0, count) are written.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_float(local const float * data, int tid, float * reg)
{
    // Load 11 elements (data[i + tid]) from local memory into registers,
    // then synchronize the group.
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_float(const float * reg, int tid, local float * dest)
{
    // Write the 11 register values back to local memory at the mirrored
    // offsets (dest[i + tid]), then synchronize the group.
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_float(int count, global const float * source, int tid, local float * dest)
{
    // Stage a bounded tile from global to local memory via registers.
    float staging[11];
    global_to_regstr_1_11_float(count, source, tid, staging);
    regstr_to_shared_1_11_float(staging, tid, dest);
}
void shared_to_global_1_11_float(int count, local const float * source, int tid, global float * dest)
{
    // Copy up to 11 elements from local to global memory, bounds-checked
    // against count per element.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_float(local const float * data, int tid, float * reg)
{
    // Gather this thread's contiguous 11-element slice (base 11 * tid)
    // from local memory into registers, then synchronize the group.
    local const float * base = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
        reg[i] = base[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_float(const float * reg, int tid, local float * dest)
{
    // Scatter this thread's 11 register values into its contiguous slice
    // of local memory (base 11 * tid), then synchronize the group.
    local float * base = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
        base[i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_regstr_pred_1_11_int(int count, global const int * data, int tid, int * reg)
{
    // Predicated register load: element i is read from data[i + tid] only
    // while that index lies inside [0, count); other slots are left as-is.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_int(int count, global const int * data, int tid, int * reg)
{
    // Full-tile fast path: at least 11 elements means no bounds checks;
    // otherwise defer to the predicated loader.
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i)
            reg[i] = data[i + tid];
    }
    else
    {
        global_to_regstr_pred_1_11_int(count, data, tid, reg);
    }
}
void regstr_to_global_1_11_int(int count, const int * reg, int tid, global int * dest)
{
    // Guarded store of the 11 register values to global memory; only
    // indices inside [0, count) are written.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_int(local const int * data, int tid, int * reg)
{
    // Load 11 elements (data[i + tid]) from local memory into registers,
    // then synchronize the group.
    for (int i = 0; i < 11; ++i)
        reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_int(const int * reg, int tid, local int * dest)
{
    // Write the 11 register values back to local memory at the mirrored
    // offsets (dest[i + tid]), then synchronize the group.
    for (int i = 0; i < 11; ++i)
        dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_int(int count, global const int * source, int tid, local int * dest)
{
    // Stage a bounded tile from global to local memory via registers.
    int staging[11];
    global_to_regstr_1_11_int(count, source, tid, staging);
    regstr_to_shared_1_11_int(staging, tid, dest);
}
void shared_to_global_1_11_int(int count, local const int * source, int tid, global int * dest)
{
    // Copy up to 11 elements from local to global memory, bounds-checked
    // against count per element.
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count)
            dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_int(local const int * data, int tid, int * reg)
{
    // Gather this thread's contiguous 11-element slice (base 11 * tid)
    // from local memory into registers, then synchronize the group.
    local const int * base = data + 11 * tid;
    for (int i = 0; i < 11; ++i)
        reg[i] = base[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_int(const int * reg, int tid, local int * dest)
{
    // Scatter this thread's 11 register values into its contiguous slice
    // of local memory (base 11 * tid), then synchronize the group.
    local int * base = dest + 11 * tid;
    for (int i = 0; i < 11; ++i)
        base[i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void serial_merge_11_int
(
int a_begin,
int a_end,
int b_begin,
int b_end,
int * indices,
local const int * keys_shared0,
int * results0
)
{
    // Sequentially merge exactly 11 int keys from the two sorted
    // local-memory ranges [a_begin, a_end) and [b_begin, b_end) of
    // keys_shared0, recording the merged key and its source index per
    // output slot.  Like the original unrolled code, the next key on the
    // consumed side is re-read after each step, so the tile is expected to
    // tolerate a one-past read (padded storage).
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];
    for (int i = 0; i < 11; ++i)
    {
        // Take from A when B is exhausted, or when A still has keys and
        // A's key does not compare after B's key (keeps the merge stable).
        bool p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0));
        results0[i] = p ? a_key0 : b_key0;
        indices[i] = p ? a_begin : b_begin;
        if (p)
            a_key0 = keys_shared0[++a_begin];
        else
            b_key0 = keys_shared0[++b_begin];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void swap_int
(
    int * a0,
    int * b0
)
{
    /* Exchange the two ints pointed to by a0 and b0 via a temporary. */
    int tmp = *b0;
    *b0 = *a0;
    *a0 = tmp;
}
void swap_float
(
    float * a0,
    float * b0
)
{
    /* Exchange the two floats pointed to by a0 and b0 via a temporary. */
    float tmp = *b0;
    *b0 = *a0;
    *a0 = tmp;
}
void odd_even_transpose_sort_11_int_float
(
    int * keys0,
    float * vals0
)
{
    /* Odd-even transposition sort of 11 key/value pairs kept in private
       memory. 11 alternating passes (even-indexed pairs, then odd-indexed
       pairs) are sufficient to fully sort 11 elements; values travel with
       their keys. The trip counts are compile-time constants so the
       compiler is free to fully unroll both loops. */
    for (int pass = 0; pass < 11; ++pass)
    {
        for (int i = pass & 1; i + 1 < 11; i += 2)
        {
            if (comp(keys0[i + 1], keys0[i]))
            {
                swap_int(keys0 + i, keys0 + i + 1);
                swap_float(vals0 + i, vals0 + i + 1);
            }
        }
    }
}
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void block_sort_pass_1_11_int
(
    int tid,
    int count,
    int coop,
    int * indices,
    local const int * keys_shared0,
    int * keys0
)
{
    /* One merge pass of the in-block mergesort: 'coop' threads cooperate
       to merge two adjacent sorted sublists of 11*(coop/2) keys each.
       Each thread produces the 11 merged keys on its own diagonal. */
    int list = ~(coop - 1) & tid;                      /* first thread of this cooperative group */
    int diag = min(count, 11 * ((coop - 1) & tid));    /* this thread's diagonal within the pair */
    int start = 11 * list;
    int a0 = min(count, start);                        /* begin of sublist A */
    int b0 = min(count, start + 11 * (coop / 2));      /* begin of sublist B (== end of A) */
    int b1 = min(count, start + 11 * coop);            /* end of sublist B */
    /* Split the diagonal between A and B, then merge 11 elements serially. */
    int p = merge_path_int(b0 - a0, b1 - b0, diag, keys_shared0 + a0, keys_shared0 + b0);
    serial_merge_11_int(a0 + p, b0, b0 + diag - p, b1, indices, keys_shared0, keys0);
}
void gather_1_11_float | |
( | |
const int * indices, | |
int tid, | |
local const float * data0, | |
float * reg0 | |
) | |
{ | |
reg0[0] = data0[indices[0]]; | |
reg0[1] = data0[indices[1]]; | |
reg0[2] = data0[indices[2]]; | |
reg0[3] = data0[indices[3]]; | |
reg0[4] = data0[indices[4]]; | |
reg0[5] = data0[indices[5]]; | |
reg0[6] = data0[indices[6]]; | |
reg0[7] = data0[indices[7]]; | |
reg0[8] = data0[indices[8]]; | |
reg0[9] = data0[indices[9]]; | |
reg0[10] = data0[indices[10]]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void block_sort_loop_1_11_int_float | |
( | |
int tid, | |
int count, | |
local int * keys_shared0, | |
float * thread_vals0, | |
local float * vals_shared0 | |
) | |
{ | |
int indices[11]; | |
int keys0[11]; | |
} | |
void mergesort_1_11_int_float | |
( | |
int count, | |
int tid, | |
int * thread_keys0, | |
local int * keys_shared0, | |
float * thread_vals0, | |
local float * vals_shared0 | |
) | |
{ | |
if(11 * tid < count) odd_even_transpose_sort_11_int_float(thread_keys0, thread_vals0); | |
thread_to_shared_11_int(thread_keys0, tid, keys_shared0); | |
block_sort_loop_1_11_int_float(tid, count, keys_shared0, thread_vals0, vals_shared0); | |
} | |
kernel void block_sort | |
( | |
int count, | |
global const int * keys_src0, | |
global int * keys_dst0, | |
global const float * vals_src0, | |
global float * vals_dst0 | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
int keys0[12]; | |
}; | |
struct | |
{ | |
float vals0[11]; | |
}; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int gid = 11 * block; | |
int count2 = min(11, count - gid); | |
float thread_vals0[11]; | |
global_to_shared_1_11_float(count2, vals_src0 + gid, tid, shared.vals0); | |
shared_to_thread_11_float(shared.vals0, tid, thread_vals0); | |
int thread_keys0[11]; | |
global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0); | |
shared_to_thread_11_int(shared.keys0, tid, thread_keys0); | |
int first = 11 * tid; | |
if(first + 11 > count2 && first < count2) | |
{ | |
int max_key0 = thread_keys0[0]; | |
if(first + 1 < count2 && comp(max_key0, thread_keys0[1]) ) | |
{ | |
max_key0 = thread_keys0[1]; | |
} | |
if(first + 2 < count2 && comp(max_key0, thread_keys0[2]) ) | |
{ | |
max_key0 = thread_keys0[2]; | |
} | |
if(first + 3 < count2 && comp(max_key0, thread_keys0[3]) ) | |
{ | |
max_key0 = thread_keys0[3]; | |
} | |
if(first + 4 < count2 && comp(max_key0, thread_keys0[4]) ) | |
{ | |
max_key0 = thread_keys0[4]; | |
} | |
if(first + 5 < count2 && comp(max_key0, thread_keys0[5]) ) | |
{ | |
max_key0 = thread_keys0[5]; | |
} | |
if(first + 6 < count2 && comp(max_key0, thread_keys0[6]) ) | |
{ | |
max_key0 = thread_keys0[6]; | |
} | |
if(first + 7 < count2 && comp(max_key0, thread_keys0[7]) ) | |
{ | |
max_key0 = thread_keys0[7]; | |
} | |
if(first + 8 < count2 && comp(max_key0, thread_keys0[8]) ) | |
{ | |
max_key0 = thread_keys0[8]; | |
} | |
if(first + 9 < count2 && comp(max_key0, thread_keys0[9]) ) | |
{ | |
max_key0 = thread_keys0[9]; | |
} | |
if(first + 10 < count2 && comp(max_key0, thread_keys0[10]) ) | |
{ | |
max_key0 = thread_keys0[10]; | |
} | |
if(first + 0 >= count2) | |
{ | |
thread_keys0[0] = max_key0; | |
} | |
if(first + 1 >= count2) | |
{ | |
thread_keys0[1] = max_key0; | |
} | |
if(first + 2 >= count2) | |
{ | |
thread_keys0[2] = max_key0; | |
} | |
if(first + 3 >= count2) | |
{ | |
thread_keys0[3] = max_key0; | |
} | |
if(first + 4 >= count2) | |
{ | |
thread_keys0[4] = max_key0; | |
} | |
if(first + 5 >= count2) | |
{ | |
thread_keys0[5] = max_key0; | |
} | |
if(first + 6 >= count2) | |
{ | |
thread_keys0[6] = max_key0; | |
} | |
if(first + 7 >= count2) | |
{ | |
thread_keys0[7] = max_key0; | |
} | |
if(first + 8 >= count2) | |
{ | |
thread_keys0[8] = max_key0; | |
} | |
if(first + 9 >= count2) | |
{ | |
thread_keys0[9] = max_key0; | |
} | |
if(first + 10 >= count2) | |
{ | |
thread_keys0[10] = max_key0; | |
} | |
} | |
mergesort_1_11_int_float(count2, tid, thread_keys0, shared.keys0, thread_vals0, shared.vals0); | |
shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid); | |
thread_to_shared_11_float(thread_vals0, tid, shared.vals0); | |
shared_to_global_1_11_float(count2, shared.vals0, tid, vals_dst0 + gid); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
    int x,
    int y
)
{
    /* Strict ascending-order comparator used by the sort/merge routines:
       true when x must precede y. */
    bool x_precedes_y = x < y;
    return x_precedes_y;
}
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
int4 find_mergesort_interval
(
    int4 frame,
    int coop,
    int block,
    int nv,
    int count,
    int mp0,
    int mp1
)
{
    /* Convert a mergesort frame plus two merge-path splitters (mp0, mp1)
       into this block's source interval: (x,y) = [begin,end) in run A,
       (z,w) = [begin,end) in run B, all clamped to 'count'. */
    int diag = nv * block - frame.x;   /* this block's diagonal within the frame */
    int4 interval;
    interval.x = frame.x + mp0;
    interval.y = min(count, frame.x + mp1);
    interval.z = min(count, frame.y + diag - mp0);
    interval.w = min(count, frame.y + diag + nv - mp1);
    /* The last block of a cooperative group consumes everything that
       remains of both runs, regardless of the upper splitter. */
    if(coop - 1 == ((coop - 1) & block))
    {
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    }
    return interval;
}
int4 compute_merge_range | |
( | |
int a_count, | |
int b_count, | |
int block, | |
int coop, | |
int nv, | |
global const int * mp_global | |
) | |
{ | |
int mp0 = mp_global[block]; | |
int mp1 = mp_global[block + 1]; | |
int gid = nv * block; | |
int4 range; | |
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, block, nv); | |
range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1); | |
} | |
else | |
{ | |
range.x = mp0; | |
range.y = mp1; | |
range.z = gid - range.x; | |
range.w = min(a_count + b_count, gid + nv) - range.y; | |
} | |
return range; | |
} | |
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_int | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
int * results0 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[0] = p ? a_key0 : b_key0; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[1] = p ? a_key0 : b_key0; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[2] = p ? a_key0 : b_key0; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[3] = p ? a_key0 : b_key0; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[4] = p ? a_key0 : b_key0; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[5] = p ? a_key0 : b_key0; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[6] = p ? a_key0 : b_key0; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[7] = p ? a_key0 : b_key0; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[8] = p ? a_key0 : b_key0; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[9] = p ? a_key0 : b_key0; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[10] = p ? a_key0 : b_key0; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void load2_to_regstr_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
int * reg | |
) | |
{ | |
b_global -= a_count; | |
int total = a_count + b_count; | |
int index; | |
if (total >= 11) | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else reg[10] = b_global[index]; | |
} | |
else | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else if (index < total) reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else if (index < total) reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else if (index < total) reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else if (index < total) reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else if (index < total) reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else if (index < total) reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else if (index < total) reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else if (index < total) reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else if (index < total) reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else if (index < total) reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else if (index < total) reg[10] = b_global[index]; | |
} | |
} | |
void load2_to_shared_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
local int * shared | |
) | |
{ | |
int reg[11]; | |
load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, shared); | |
} | |
void merge_keys_indices_1_11_int | |
( | |
int a_count, | |
int b_count, | |
int4 range, | |
int tid, | |
int * indices, | |
global const int * a_global0, | |
global const int * b_global0, | |
local int * keys_shared0, | |
int * results0 | |
) | |
{ | |
int a0 = range.x; | |
int a1 = range.y; | |
int b0 = range.z; | |
int b1 = range.w; | |
a_count = a1 - a0; | |
b_count = b1 - b0; | |
load2_to_shared_1_11_11_int(a_global0 + a0, a_count, b_global0 + b0, b_count, tid, keys_shared0); | |
int diag = 11 * tid; | |
int mp = merge_path_int(a_count, b_count, diag, keys_shared0, keys_shared0 + a_count); | |
int a0tid = mp; | |
int a1tid = a_count; | |
int b0tid = a_count + diag - mp; | |
int b1tid = a_count + b_count; | |
serial_merge_11_int(a0tid, a1tid, b0tid, b1tid, indices, keys_shared0, results0); | |
} | |
void transfer_merge_values_regstr_1_11_float | |
( | |
int count, | |
int b_start, | |
const int * indices, | |
int tid, | |
global const float * a_global0, | |
global const float * b_global0, | |
float * reg0 | |
) | |
{ | |
b_global0 -= b_start; | |
if(count >= 11) | |
{ | |
if (indices[0] < b_start) | |
{ | |
reg0[0] = a_global0[indices[0]]; | |
} | |
else | |
{ | |
reg0[0] = b_global0[indices[0]]; | |
} | |
if (indices[1] < b_start) | |
{ | |
reg0[1] = a_global0[indices[1]]; | |
} | |
else | |
{ | |
reg0[1] = b_global0[indices[1]]; | |
} | |
if (indices[2] < b_start) | |
{ | |
reg0[2] = a_global0[indices[2]]; | |
} | |
else | |
{ | |
reg0[2] = b_global0[indices[2]]; | |
} | |
if (indices[3] < b_start) | |
{ | |
reg0[3] = a_global0[indices[3]]; | |
} | |
else | |
{ | |
reg0[3] = b_global0[indices[3]]; | |
} | |
if (indices[4] < b_start) | |
{ | |
reg0[4] = a_global0[indices[4]]; | |
} | |
else | |
{ | |
reg0[4] = b_global0[indices[4]]; | |
} | |
if (indices[5] < b_start) | |
{ | |
reg0[5] = a_global0[indices[5]]; | |
} | |
else | |
{ | |
reg0[5] = b_global0[indices[5]]; | |
} | |
if (indices[6] < b_start) | |
{ | |
reg0[6] = a_global0[indices[6]]; | |
} | |
else | |
{ | |
reg0[6] = b_global0[indices[6]]; | |
} | |
if (indices[7] < b_start) | |
{ | |
reg0[7] = a_global0[indices[7]]; | |
} | |
else | |
{ | |
reg0[7] = b_global0[indices[7]]; | |
} | |
if (indices[8] < b_start) | |
{ | |
reg0[8] = a_global0[indices[8]]; | |
} | |
else | |
{ | |
reg0[8] = b_global0[indices[8]]; | |
} | |
if (indices[9] < b_start) | |
{ | |
reg0[9] = a_global0[indices[9]]; | |
} | |
else | |
{ | |
reg0[9] = b_global0[indices[9]]; | |
} | |
if (indices[10] < b_start) | |
{ | |
reg0[10] = a_global0[indices[10]]; | |
} | |
else | |
{ | |
reg0[10] = b_global0[indices[10]]; | |
} | |
} | |
else | |
{ | |
int index; | |
index = 0 + tid; | |
if(index < count) | |
{ | |
if (indices[0] < b_start) | |
{ | |
reg0[0] = a_global0[indices[0]]; | |
} | |
else | |
{ | |
reg0[0] = b_global0[indices[0]]; | |
} | |
} | |
index = 1 + tid; | |
if(index < count) | |
{ | |
if (indices[1] < b_start) | |
{ | |
reg0[1] = a_global0[indices[1]]; | |
} | |
else | |
{ | |
reg0[1] = b_global0[indices[1]]; | |
} | |
} | |
index = 2 + tid; | |
if(index < count) | |
{ | |
if (indices[2] < b_start) | |
{ | |
reg0[2] = a_global0[indices[2]]; | |
} | |
else | |
{ | |
reg0[2] = b_global0[indices[2]]; | |
} | |
} | |
index = 3 + tid; | |
if(index < count) | |
{ | |
if (indices[3] < b_start) | |
{ | |
reg0[3] = a_global0[indices[3]]; | |
} | |
else | |
{ | |
reg0[3] = b_global0[indices[3]]; | |
} | |
} | |
index = 4 + tid; | |
if(index < count) | |
{ | |
if (indices[4] < b_start) | |
{ | |
reg0[4] = a_global0[indices[4]]; | |
} | |
else | |
{ | |
reg0[4] = b_global0[indices[4]]; | |
} | |
} | |
index = 5 + tid; | |
if(index < count) | |
{ | |
if (indices[5] < b_start) | |
{ | |
reg0[5] = a_global0[indices[5]]; | |
} | |
else | |
{ | |
reg0[5] = b_global0[indices[5]]; | |
} | |
} | |
index = 6 + tid; | |
if(index < count) | |
{ | |
if (indices[6] < b_start) | |
{ | |
reg0[6] = a_global0[indices[6]]; | |
} | |
else | |
{ | |
reg0[6] = b_global0[indices[6]]; | |
} | |
} | |
index = 7 + tid; | |
if(index < count) | |
{ | |
if (indices[7] < b_start) | |
{ | |
reg0[7] = a_global0[indices[7]]; | |
} | |
else | |
{ | |
reg0[7] = b_global0[indices[7]]; | |
} | |
} | |
index = 8 + tid; | |
if(index < count) | |
{ | |
if (indices[8] < b_start) | |
{ | |
reg0[8] = a_global0[indices[8]]; | |
} | |
else | |
{ | |
reg0[8] = b_global0[indices[8]]; | |
} | |
} | |
index = 9 + tid; | |
if(index < count) | |
{ | |
if (indices[9] < b_start) | |
{ | |
reg0[9] = a_global0[indices[9]]; | |
} | |
else | |
{ | |
reg0[9] = b_global0[indices[9]]; | |
} | |
} | |
index = 10 + tid; | |
if(index < count) | |
{ | |
if (indices[10] < b_start) | |
{ | |
reg0[10] = a_global0[indices[10]]; | |
} | |
else | |
{ | |
reg0[10] = b_global0[indices[10]]; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void transfer_merge_values_shared_1_11_float | |
( | |
int count, | |
int b_start, | |
local const int * indices_shared, | |
int tid, | |
global const float * a_global0, | |
global const float * b_global0, | |
global float * dest_global0 | |
) | |
{ | |
int indices[11]; | |
shared_to_regstr_1_11_int(indices_shared, tid, indices); | |
float reg0[11]; | |
transfer_merge_values_regstr_1_11_float(count, b_start, indices, tid, a_global0, b_global0, reg0); | |
regstr_to_global_1_11_float(count, reg0, tid, dest_global0); | |
} | |
void device_merge_1_11_int_float | |
( | |
int a_count, | |
int b_count, | |
global const int * a_keys_global0, | |
global const int * b_keys_global0, | |
global int * keys_global0, | |
local int * keys_shared0, | |
global const float * a_vals_global0, | |
global const float * b_vals_global0, | |
global float * vals_global0, | |
int tid, | |
int block, | |
int4 range, | |
local int * indices_shared | |
) | |
{ | |
int results0[11]; | |
int indices[11]; | |
merge_keys_indices_1_11_int(a_count, b_count, range, tid, indices, a_keys_global0, b_keys_global0, keys_shared0, results0); | |
thread_to_shared_11_int(results0, tid, keys_shared0); | |
a_count = range.y - range.x; | |
b_count = range.w - range.z; | |
shared_to_global_1_11_int(a_count + b_count, keys_shared0, tid, keys_global0 + 11 * block); | |
thread_to_shared_11_int(indices, tid, indices_shared); | |
transfer_merge_values_shared_1_11_float(a_count + b_count, a_count, indices_shared, tid, a_vals_global0 + range.x, b_vals_global0 + range.z, vals_global0 + 11 * block); | |
} | |
kernel void merge | |
( | |
int a_count, | |
int b_count, | |
global const int * a_keys_global0, | |
global const int * b_keys_global0, | |
global int * keys_global0, | |
global const float * a_vals_global0, | |
global const float * b_vals_global0, | |
global float * vals_global0, | |
global const int * mp_global, | |
int coop | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
int keys0[12]; | |
}; | |
int indices[11]; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global); | |
device_merge_1_11_int_float(a_count, b_count, a_keys_global0, b_keys_global0, keys_global0, shared.keys0, a_vals_global0, b_vals_global0, vals_global0, tid, block, range, shared.indices); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int x, | |
int y | |
) | |
{ | |
return x < y; | |
} | |
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
global const int * a0, | |
global const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
kernel void merge_partition | |
( | |
int a_count, | |
int b_count, | |
int nv, | |
int coop, | |
global int * mp_global, | |
int num_searches, | |
global const int * a_global0, | |
global const int * b_global0 | |
) | |
{ | |
int partition = get_global_id(0); | |
if (partition < num_searches) | |
{ | |
int a0 = 0, b0 = 0; | |
int gid = nv * partition; | |
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, partition, nv); | |
a0 = frame.x; | |
b0 = min(a_count, frame.y); | |
b_count = min(a_count, frame.y + frame.z) - b0; | |
a_count = min(a_count, frame.x + frame.z) - a0; | |
gid -= a0; | |
} | |
int mp = merge_path_int(a_count, b_count, min(gid, a_count + b_count), a_global0 + a0, b_global0 + b0); | |
mp_global[partition] = mp; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a, | |
int b | |
) | |
{ | |
char bit1 = 1 & a; char bit2 = 1 & b; if (bit1 == bit2) return a < b; return bit1 < bit2; | |
} | |
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_int | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
int * results0 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[0] = p ? a_key0 : b_key0; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[1] = p ? a_key0 : b_key0; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[2] = p ? a_key0 : b_key0; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[3] = p ? a_key0 : b_key0; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[4] = p ? a_key0 : b_key0; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[5] = p ? a_key0 : b_key0; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[6] = p ? a_key0 : b_key0; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[7] = p ? a_key0 : b_key0; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[8] = p ? a_key0 : b_key0; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[9] = p ? a_key0 : b_key0; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[10] = p ? a_key0 : b_key0; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void swap_int | |
( | |
int * a0, | |
int * b0 | |
) | |
{ | |
{ | |
int c = *a0; | |
*a0 = *b0; | |
*b0 = c; | |
} | |
} | |
void swap_float | |
( | |
float * a0, | |
float * b0 | |
) | |
{ | |
{ | |
float c = *a0; | |
*a0 = *b0; | |
*b0 = c; | |
} | |
} | |
void odd_even_transpose_sort_11_int_float | |
( | |
int * keys0, | |
float * vals0 | |
) | |
{ | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_int(keys0 + 1, keys0 + 2); | |
swap_float(vals0 + 1, vals0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_int(keys0 + 3, keys0 + 4); | |
swap_float(vals0 + 3, vals0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_int(keys0 + 5, keys0 + 6); | |
swap_float(vals0 + 5, vals0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_int(keys0 + 7, keys0 + 8); | |
swap_float(vals0 + 7, vals0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_int(keys0 + 9, keys0 + 10); | |
swap_float(vals0 + 9, vals0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_int(keys0 + 1, keys0 + 2); | |
swap_float(vals0 + 1, vals0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_int(keys0 + 3, keys0 + 4); | |
swap_float(vals0 + 3, vals0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_int(keys0 + 5, keys0 + 6); | |
swap_float(vals0 + 5, vals0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_int(keys0 + 7, keys0 + 8); | |
swap_float(vals0 + 7, vals0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_int(keys0 + 9, keys0 + 10); | |
swap_float(vals0 + 9, vals0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_int(keys0 + 1, keys0 + 2); | |
swap_float(vals0 + 1, vals0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_int(keys0 + 3, keys0 + 4); | |
swap_float(vals0 + 3, vals0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_int(keys0 + 5, keys0 + 6); | |
swap_float(vals0 + 5, vals0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_int(keys0 + 7, keys0 + 8); | |
swap_float(vals0 + 7, vals0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_int(keys0 + 9, keys0 + 10); | |
swap_float(vals0 + 9, vals0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_int(keys0 + 1, keys0 + 2); | |
swap_float(vals0 + 1, vals0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_int(keys0 + 3, keys0 + 4); | |
swap_float(vals0 + 3, vals0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_int(keys0 + 5, keys0 + 6); | |
swap_float(vals0 + 5, vals0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_int(keys0 + 7, keys0 + 8); | |
swap_float(vals0 + 7, vals0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_int(keys0 + 9, keys0 + 10); | |
swap_float(vals0 + 9, vals0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
if (comp(keys0[2], keys0[1])) | |
{ | |
swap_int(keys0 + 1, keys0 + 2); | |
swap_float(vals0 + 1, vals0 + 2); | |
} | |
if (comp(keys0[4], keys0[3])) | |
{ | |
swap_int(keys0 + 3, keys0 + 4); | |
swap_float(vals0 + 3, vals0 + 4); | |
} | |
if (comp(keys0[6], keys0[5])) | |
{ | |
swap_int(keys0 + 5, keys0 + 6); | |
swap_float(vals0 + 5, vals0 + 6); | |
} | |
if (comp(keys0[8], keys0[7])) | |
{ | |
swap_int(keys0 + 7, keys0 + 8); | |
swap_float(vals0 + 7, vals0 + 8); | |
} | |
if (comp(keys0[10], keys0[9])) | |
{ | |
swap_int(keys0 + 9, keys0 + 10); | |
swap_float(vals0 + 9, vals0 + 10); | |
} | |
if (comp(keys0[1], keys0[0])) | |
{ | |
swap_int(keys0 + 0, keys0 + 1); | |
swap_float(vals0 + 0, vals0 + 1); | |
} | |
if (comp(keys0[3], keys0[2])) | |
{ | |
swap_int(keys0 + 2, keys0 + 3); | |
swap_float(vals0 + 2, vals0 + 3); | |
} | |
if (comp(keys0[5], keys0[4])) | |
{ | |
swap_int(keys0 + 4, keys0 + 5); | |
swap_float(vals0 + 4, vals0 + 5); | |
} | |
if (comp(keys0[7], keys0[6])) | |
{ | |
swap_int(keys0 + 6, keys0 + 7); | |
swap_float(vals0 + 6, vals0 + 7); | |
} | |
if (comp(keys0[9], keys0[8])) | |
{ | |
swap_int(keys0 + 8, keys0 + 9); | |
swap_float(vals0 + 8, vals0 + 9); | |
} | |
} | |
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void block_sort_pass_1_11_int | |
( | |
int tid, | |
int count, | |
int coop, | |
int * indices, | |
local const int * keys_shared0, | |
int * keys0 | |
) | |
{ | |
int list = ~(coop - 1) & tid; | |
int diag = min(count, 11 * ((coop - 1) & tid)); | |
int start = 11 * list; | |
int a0 = min(count, start); | |
int b0 = min(count, start + 11 * (coop / 2)); | |
int b1 = min(count, start + 11 * coop); | |
int p = merge_path_int(b0 - a0, b1 - b0, diag, keys_shared0 + a0, keys_shared0 + b0); | |
serial_merge_11_int(a0 + p, b0, b0 + diag - p, b1, indices, keys_shared0, keys0); | |
} | |
void gather_1_11_float | |
( | |
const int * indices, | |
int tid, | |
local const float * data0, | |
float * reg0 | |
) | |
{ | |
reg0[0] = data0[indices[0]]; | |
reg0[1] = data0[indices[1]]; | |
reg0[2] = data0[indices[2]]; | |
reg0[3] = data0[indices[3]]; | |
reg0[4] = data0[indices[4]]; | |
reg0[5] = data0[indices[5]]; | |
reg0[6] = data0[indices[6]]; | |
reg0[7] = data0[indices[7]]; | |
reg0[8] = data0[indices[8]]; | |
reg0[9] = data0[indices[9]]; | |
reg0[10] = data0[indices[10]]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void block_sort_loop_1_11_int_float | |
( | |
int tid, | |
int count, | |
local int * keys_shared0, | |
float * thread_vals0, | |
local float * vals_shared0 | |
) | |
{ | |
int indices[11]; | |
int keys0[11]; | |
} | |
/* Sort this thread's 11-element key/value tile in registers and publish
 * the keys to local memory.  The cooperative merge loop that follows is
 * a no-op in this single-thread instantiation (its generated body is
 * empty — see block_sort_loop_1_11_int_float). */
void mergesort_1_11_int_float
(
    int count,
    int tid,
    int * thread_keys0,
    local int * keys_shared0,
    float * thread_vals0,
    local float * vals_shared0
)
{
    /* Only threads whose tile start lies inside the data need to sort. */
    if(11 * tid < count) odd_even_transpose_sort_11_int_float(thread_keys0, thread_vals0);
    thread_to_shared_11_int(thread_keys0, tid, keys_shared0);
    block_sort_loop_1_11_int_float(tid, count, keys_shared0, thread_vals0, vals_shared0);
}
kernel void block_sort | |
( | |
int count, | |
global const int * keys_src0, | |
global int * keys_dst0, | |
global const float * vals_src0, | |
global float * vals_dst0 | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
int keys0[12]; | |
}; | |
struct | |
{ | |
float vals0[11]; | |
}; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int gid = 11 * block; | |
int count2 = min(11, count - gid); | |
float thread_vals0[11]; | |
global_to_shared_1_11_float(count2, vals_src0 + gid, tid, shared.vals0); | |
shared_to_thread_11_float(shared.vals0, tid, thread_vals0); | |
int thread_keys0[11]; | |
global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0); | |
shared_to_thread_11_int(shared.keys0, tid, thread_keys0); | |
int first = 11 * tid; | |
if(first + 11 > count2 && first < count2) | |
{ | |
int max_key0 = thread_keys0[0]; | |
if(first + 1 < count2 && comp(max_key0, thread_keys0[1]) ) | |
{ | |
max_key0 = thread_keys0[1]; | |
} | |
if(first + 2 < count2 && comp(max_key0, thread_keys0[2]) ) | |
{ | |
max_key0 = thread_keys0[2]; | |
} | |
if(first + 3 < count2 && comp(max_key0, thread_keys0[3]) ) | |
{ | |
max_key0 = thread_keys0[3]; | |
} | |
if(first + 4 < count2 && comp(max_key0, thread_keys0[4]) ) | |
{ | |
max_key0 = thread_keys0[4]; | |
} | |
if(first + 5 < count2 && comp(max_key0, thread_keys0[5]) ) | |
{ | |
max_key0 = thread_keys0[5]; | |
} | |
if(first + 6 < count2 && comp(max_key0, thread_keys0[6]) ) | |
{ | |
max_key0 = thread_keys0[6]; | |
} | |
if(first + 7 < count2 && comp(max_key0, thread_keys0[7]) ) | |
{ | |
max_key0 = thread_keys0[7]; | |
} | |
if(first + 8 < count2 && comp(max_key0, thread_keys0[8]) ) | |
{ | |
max_key0 = thread_keys0[8]; | |
} | |
if(first + 9 < count2 && comp(max_key0, thread_keys0[9]) ) | |
{ | |
max_key0 = thread_keys0[9]; | |
} | |
if(first + 10 < count2 && comp(max_key0, thread_keys0[10]) ) | |
{ | |
max_key0 = thread_keys0[10]; | |
} | |
if(first + 0 >= count2) | |
{ | |
thread_keys0[0] = max_key0; | |
} | |
if(first + 1 >= count2) | |
{ | |
thread_keys0[1] = max_key0; | |
} | |
if(first + 2 >= count2) | |
{ | |
thread_keys0[2] = max_key0; | |
} | |
if(first + 3 >= count2) | |
{ | |
thread_keys0[3] = max_key0; | |
} | |
if(first + 4 >= count2) | |
{ | |
thread_keys0[4] = max_key0; | |
} | |
if(first + 5 >= count2) | |
{ | |
thread_keys0[5] = max_key0; | |
} | |
if(first + 6 >= count2) | |
{ | |
thread_keys0[6] = max_key0; | |
} | |
if(first + 7 >= count2) | |
{ | |
thread_keys0[7] = max_key0; | |
} | |
if(first + 8 >= count2) | |
{ | |
thread_keys0[8] = max_key0; | |
} | |
if(first + 9 >= count2) | |
{ | |
thread_keys0[9] = max_key0; | |
} | |
if(first + 10 >= count2) | |
{ | |
thread_keys0[10] = max_key0; | |
} | |
} | |
mergesort_1_11_int_float(count2, tid, thread_keys0, shared.keys0, thread_vals0, shared.vals0); | |
shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid); | |
thread_to_shared_11_float(thread_vals0, tid, shared.vals0); | |
shared_to_global_1_11_float(count2, shared.vals0, tid, vals_dst0 + gid); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
/* Ordering predicate for the sort: keys are ordered primarily by their
 * least-significant (parity) bit — every even key precedes every odd
 * key — and secondarily by ordinary numeric value. */
bool comp
(
    int a,
    int b
)
{
    int pa = 1 & a;
    int pb = 1 & b;
    if (pa != pb)
        return pa < pb;
    return a < b;
}
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
/* From a mergesort frame (A start = frame.x, B start = frame.y, run
 * length = frame.z) and the two precomputed merge-path splits mp0/mp1,
 * compute the sub-ranges this block merges: [x,y) within A and [z,w)
 * within B, all clamped to the input size `count`. */
int4 find_mergesort_interval
(
    int4 frame,
    int coop,
    int block,
    int nv,
    int count,
    int mp0,
    int mp1
)
{
    /* This block's diagonal offset inside the frame. */
    int diag = nv * block - frame.x;
    int4 interval;
    interval.x = frame.x + mp0;
    interval.y = min(count, frame.x + mp1);
    interval.z = min(count, frame.y + diag - mp0);
    interval.w = min(count, frame.y + diag + nv - mp1);
    /* The last block of a cooperative group consumes the remainder of
       both runs. */
    if(coop - 1 == ((coop - 1) & block))
    {
        interval.y = min(count, frame.x + frame.z);
        interval.w = min(count, frame.y + frame.z);
    }
    return interval;
}
/* Compute the global element ranges this block merges: [x,y) from the A
 * sequence and [z,w) from the B sequence.  When coop != 0 (a mergesort
 * pass) the range is derived from the block's mergesort frame; when
 * coop == 0 (plain merge of two arrays) it comes directly from the
 * precomputed merge-path partition array. */
int4 compute_merge_range
(
    int a_count,
    int b_count,
    int block,
    int coop,
    int nv,
    global const int * mp_global
)
{
    /* Merge-path splits for this block's diagonal and the next one. */
    int mp0 = mp_global[block];
    int mp1 = mp_global[block + 1];
    int gid = nv * block;
    int4 range;
    if(coop)
    {
        int4 frame = find_mergesort_frame(coop, block, nv);
        range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
    }
    else
    {
        range.x = mp0;
        range.y = mp1;
        /* Whatever the tile does not take from A it takes from B. */
        range.z = gid - range.x;
        range.w = min(a_count + b_count, gid + nv) - range.y;
    }
    return range;
}
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_int | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
int * results0 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[0] = p ? a_key0 : b_key0; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[1] = p ? a_key0 : b_key0; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[2] = p ? a_key0 : b_key0; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[3] = p ? a_key0 : b_key0; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[4] = p ? a_key0 : b_key0; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[5] = p ? a_key0 : b_key0; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[6] = p ? a_key0 : b_key0; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[7] = p ? a_key0 : b_key0; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[8] = p ? a_key0 : b_key0; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[9] = p ? a_key0 : b_key0; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, a_key0)); | |
results0[10] = p ? a_key0 : b_key0; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void load2_to_regstr_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
int * reg | |
) | |
{ | |
b_global -= a_count; | |
int total = a_count + b_count; | |
int index; | |
if (total >= 11) | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else reg[10] = b_global[index]; | |
} | |
else | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else if (index < total) reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else if (index < total) reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else if (index < total) reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else if (index < total) reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else if (index < total) reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else if (index < total) reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else if (index < total) reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else if (index < total) reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else if (index < total) reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else if (index < total) reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else if (index < total) reg[10] = b_global[index]; | |
} | |
} | |
void load2_to_shared_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
local int * shared | |
) | |
{ | |
int reg[11]; | |
load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, shared); | |
} | |
/* Merge the two key sub-ranges described by `range` ([x,y) of A, [z,w)
 * of B): stage both into local memory, split the merge per-thread with
 * a merge-path search, and serially merge each thread's 11 outputs.
 * `indices` receives, per output slot, the local index the winning key
 * came from, so the attached values can be gathered afterwards. */
void merge_keys_indices_1_11_int
(
    int a_count,
    int b_count,
    int4 range,
    int tid,
    int * indices,
    global const int * a_global0,
    global const int * b_global0,
    local int * keys_shared0,
    int * results0
)
{
    int a0 = range.x;
    int a1 = range.y;
    int b0 = range.z;
    int b1 = range.w;
    /* Replace the callers' totals with this block's sub-range lengths. */
    a_count = a1 - a0;
    b_count = b1 - b0;
    /* Both sub-ranges are staged contiguously: A first, then B. */
    load2_to_shared_1_11_11_int(a_global0 + a0, a_count, b_global0 + b0, b_count, tid, keys_shared0);
    /* Each thread finds its own split on its 11-wide diagonal. */
    int diag = 11 * tid;
    int mp = merge_path_int(a_count, b_count, diag, keys_shared0, keys_shared0 + a_count);
    int a0tid = mp;
    int a1tid = a_count;
    int b0tid = a_count + diag - mp;
    int b1tid = a_count + b_count;
    serial_merge_11_int(a0tid, a1tid, b0tid, b1tid, indices, keys_shared0, results0);
}
void transfer_merge_values_regstr_1_11_float | |
( | |
int count, | |
int b_start, | |
const int * indices, | |
int tid, | |
global const float * a_global0, | |
global const float * b_global0, | |
float * reg0 | |
) | |
{ | |
b_global0 -= b_start; | |
if(count >= 11) | |
{ | |
if (indices[0] < b_start) | |
{ | |
reg0[0] = a_global0[indices[0]]; | |
} | |
else | |
{ | |
reg0[0] = b_global0[indices[0]]; | |
} | |
if (indices[1] < b_start) | |
{ | |
reg0[1] = a_global0[indices[1]]; | |
} | |
else | |
{ | |
reg0[1] = b_global0[indices[1]]; | |
} | |
if (indices[2] < b_start) | |
{ | |
reg0[2] = a_global0[indices[2]]; | |
} | |
else | |
{ | |
reg0[2] = b_global0[indices[2]]; | |
} | |
if (indices[3] < b_start) | |
{ | |
reg0[3] = a_global0[indices[3]]; | |
} | |
else | |
{ | |
reg0[3] = b_global0[indices[3]]; | |
} | |
if (indices[4] < b_start) | |
{ | |
reg0[4] = a_global0[indices[4]]; | |
} | |
else | |
{ | |
reg0[4] = b_global0[indices[4]]; | |
} | |
if (indices[5] < b_start) | |
{ | |
reg0[5] = a_global0[indices[5]]; | |
} | |
else | |
{ | |
reg0[5] = b_global0[indices[5]]; | |
} | |
if (indices[6] < b_start) | |
{ | |
reg0[6] = a_global0[indices[6]]; | |
} | |
else | |
{ | |
reg0[6] = b_global0[indices[6]]; | |
} | |
if (indices[7] < b_start) | |
{ | |
reg0[7] = a_global0[indices[7]]; | |
} | |
else | |
{ | |
reg0[7] = b_global0[indices[7]]; | |
} | |
if (indices[8] < b_start) | |
{ | |
reg0[8] = a_global0[indices[8]]; | |
} | |
else | |
{ | |
reg0[8] = b_global0[indices[8]]; | |
} | |
if (indices[9] < b_start) | |
{ | |
reg0[9] = a_global0[indices[9]]; | |
} | |
else | |
{ | |
reg0[9] = b_global0[indices[9]]; | |
} | |
if (indices[10] < b_start) | |
{ | |
reg0[10] = a_global0[indices[10]]; | |
} | |
else | |
{ | |
reg0[10] = b_global0[indices[10]]; | |
} | |
} | |
else | |
{ | |
int index; | |
index = 0 + tid; | |
if(index < count) | |
{ | |
if (indices[0] < b_start) | |
{ | |
reg0[0] = a_global0[indices[0]]; | |
} | |
else | |
{ | |
reg0[0] = b_global0[indices[0]]; | |
} | |
} | |
index = 1 + tid; | |
if(index < count) | |
{ | |
if (indices[1] < b_start) | |
{ | |
reg0[1] = a_global0[indices[1]]; | |
} | |
else | |
{ | |
reg0[1] = b_global0[indices[1]]; | |
} | |
} | |
index = 2 + tid; | |
if(index < count) | |
{ | |
if (indices[2] < b_start) | |
{ | |
reg0[2] = a_global0[indices[2]]; | |
} | |
else | |
{ | |
reg0[2] = b_global0[indices[2]]; | |
} | |
} | |
index = 3 + tid; | |
if(index < count) | |
{ | |
if (indices[3] < b_start) | |
{ | |
reg0[3] = a_global0[indices[3]]; | |
} | |
else | |
{ | |
reg0[3] = b_global0[indices[3]]; | |
} | |
} | |
index = 4 + tid; | |
if(index < count) | |
{ | |
if (indices[4] < b_start) | |
{ | |
reg0[4] = a_global0[indices[4]]; | |
} | |
else | |
{ | |
reg0[4] = b_global0[indices[4]]; | |
} | |
} | |
index = 5 + tid; | |
if(index < count) | |
{ | |
if (indices[5] < b_start) | |
{ | |
reg0[5] = a_global0[indices[5]]; | |
} | |
else | |
{ | |
reg0[5] = b_global0[indices[5]]; | |
} | |
} | |
index = 6 + tid; | |
if(index < count) | |
{ | |
if (indices[6] < b_start) | |
{ | |
reg0[6] = a_global0[indices[6]]; | |
} | |
else | |
{ | |
reg0[6] = b_global0[indices[6]]; | |
} | |
} | |
index = 7 + tid; | |
if(index < count) | |
{ | |
if (indices[7] < b_start) | |
{ | |
reg0[7] = a_global0[indices[7]]; | |
} | |
else | |
{ | |
reg0[7] = b_global0[indices[7]]; | |
} | |
} | |
index = 8 + tid; | |
if(index < count) | |
{ | |
if (indices[8] < b_start) | |
{ | |
reg0[8] = a_global0[indices[8]]; | |
} | |
else | |
{ | |
reg0[8] = b_global0[indices[8]]; | |
} | |
} | |
index = 9 + tid; | |
if(index < count) | |
{ | |
if (indices[9] < b_start) | |
{ | |
reg0[9] = a_global0[indices[9]]; | |
} | |
else | |
{ | |
reg0[9] = b_global0[indices[9]]; | |
} | |
} | |
index = 10 + tid; | |
if(index < count) | |
{ | |
if (indices[10] < b_start) | |
{ | |
reg0[10] = a_global0[indices[10]]; | |
} | |
else | |
{ | |
reg0[10] = b_global0[indices[10]]; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void transfer_merge_values_shared_1_11_float | |
( | |
int count, | |
int b_start, | |
local const int * indices_shared, | |
int tid, | |
global const float * a_global0, | |
global const float * b_global0, | |
global float * dest_global0 | |
) | |
{ | |
int indices[11]; | |
shared_to_regstr_1_11_int(indices_shared, tid, indices); | |
float reg0[11]; | |
transfer_merge_values_regstr_1_11_float(count, b_start, indices, tid, a_global0, b_global0, reg0); | |
regstr_to_global_1_11_float(count, reg0, tid, dest_global0); | |
} | |
/* Merge one block's 11-element output tile of keys, then carry the
 * attached float values along using the gather indices recorded during
 * the key merge. */
void device_merge_1_11_int_float
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const int * b_keys_global0,
    global int * keys_global0,
    local int * keys_shared0,
    global const float * a_vals_global0,
    global const float * b_vals_global0,
    global float * vals_global0,
    int tid,
    int block,
    int4 range,
    local int * indices_shared
)
{
    int results0[11];
    int indices[11];
    /* Merge keys; `indices` records where each winning key came from. */
    merge_keys_indices_1_11_int(a_count, b_count, range, tid, indices, a_keys_global0, b_keys_global0, keys_shared0, results0);
    thread_to_shared_11_int(results0, tid, keys_shared0);
    /* From here on use this block's actual sub-range lengths. */
    a_count = range.y - range.x;
    b_count = range.w - range.z;
    shared_to_global_1_11_int(a_count + b_count, keys_shared0, tid, keys_global0 + 11 * block);
    thread_to_shared_11_int(indices, tid, indices_shared);
    transfer_merge_values_shared_1_11_float(a_count + b_count, a_count, indices_shared, tid, a_vals_global0 + range.x, b_vals_global0 + range.z, vals_global0 + 11 * block);
}
/* Kernel entry point: merge two sorted key/value sequences (or, when
 * coop != 0, perform one pass of a multi-block mergesort).  Each block
 * produces one 11-element tile of the merged output. */
kernel void merge
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const int * b_keys_global0,
    global int * keys_global0,
    global const float * a_vals_global0,
    global const float * b_vals_global0,
    global float * vals_global0,
    global const int * mp_global,
    int coop
)
{
    /* Keys and the value-gather index table time-share local storage;
       the key array has one slot of padding (12 vs 11) for the serial
       merge's unconditional prefetch. */
    union Shared
    {
        struct
        {
            int keys0[12]; | 
        };
        int indices[11];
    };
    local union Shared shared;
    int tid = get_local_id(0);
    int block = get_group_id(0);
    int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global);
    device_merge_1_11_int_float(a_count, b_count, a_keys_global0, b_keys_global0, keys_global0, shared.keys0, a_vals_global0, b_vals_global0, vals_global0, tid, block, range, shared.indices);
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
/* Ordering predicate for the merge: keys are ordered primarily by their
 * least-significant (parity) bit — every even key precedes every odd
 * key — and secondarily by ordinary numeric value. */
bool comp
(
    int a,
    int b
)
{
    int pa = 1 & a;
    int pb = 1 & b;
    if (pa != pb)
        return pa < pb;
    return a < b;
}
int merge_path_int | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
global const int * a0, | |
global const int * b0 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], a0[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
kernel void merge_partition | |
( | |
int a_count, | |
int b_count, | |
int nv, | |
int coop, | |
global int * mp_global, | |
int num_searches, | |
global const int * a_global0, | |
global const int * b_global0 | |
) | |
{ | |
int partition = get_global_id(0); | |
if (partition < num_searches) | |
{ | |
int a0 = 0, b0 = 0; | |
int gid = nv * partition; | |
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, partition, nv); | |
a0 = frame.x; | |
b0 = min(a_count, frame.y); | |
b_count = min(a_count, frame.y + frame.z) - b0; | |
a_count = min(a_count, frame.x + frame.z) - a0; | |
gid -= a0; | |
} | |
int mp = merge_path_int(a_count, b_count, min(gid, a_count + b_count), a_global0 + a0, b_global0 + b0); | |
mp_global[partition] = mp; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
/* Lexicographic order on (int, float) key pairs: primary int key first,
   float key breaks ties. */
bool comp
    (
        int a1,
        float a2,
        int b1,
        float b2
    )
{
    if (a1 != b1) return a1 < b1;
    return a2 < b2;
}
void global_to_regstr_pred_1_7_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
} | |
void global_to_regstr_1_7_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 7) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
} else global_to_regstr_pred_1_7_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_7_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_7_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_7_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_7_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[7]; | |
global_to_regstr_1_7_float(count, source, tid, reg); | |
regstr_to_shared_1_7_float(reg, tid, dest); | |
} | |
void shared_to_global_1_7_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_7_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[7 * tid + 0]; | |
reg[1] = data[7 * tid + 1]; | |
reg[2] = data[7 * tid + 2]; | |
reg[3] = data[7 * tid + 3]; | |
reg[4] = data[7 * tid + 4]; | |
reg[5] = data[7 * tid + 5]; | |
reg[6] = data[7 * tid + 6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_7_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[7 * tid + 0] = reg[0]; | |
dest[7 * tid + 1] = reg[1]; | |
dest[7 * tid + 2] = reg[2]; | |
dest[7 * tid + 3] = reg[3]; | |
dest[7 * tid + 4] = reg[4]; | |
dest[7 * tid + 5] = reg[5]; | |
dest[7 * tid + 6] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_7_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
} | |
void global_to_regstr_1_7_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 7) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
} else global_to_regstr_pred_1_7_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_7_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_7_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_7_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_7_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[7]; | |
global_to_regstr_1_7_int(count, source, tid, reg); | |
regstr_to_shared_1_7_int(reg, tid, dest); | |
} | |
void shared_to_global_1_7_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_7_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[7 * tid + 0]; | |
reg[1] = data[7 * tid + 1]; | |
reg[2] = data[7 * tid + 2]; | |
reg[3] = data[7 * tid + 3]; | |
reg[4] = data[7 * tid + 4]; | |
reg[5] = data[7 * tid + 5]; | |
reg[6] = data[7 * tid + 6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_7_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[7 * tid + 0] = reg[0]; | |
dest[7 * tid + 1] = reg[1]; | |
dest[7 * tid + 2] = reg[2]; | |
dest[7 * tid + 3] = reg[3]; | |
dest[7 * tid + 4] = reg[4]; | |
dest[7 * tid + 5] = reg[5]; | |
dest[7 * tid + 6] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_7_int_float | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
local const float * keys_shared1, | |
int * results0, | |
float * results1 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
float a_key1 = keys_shared1[a_begin]; | |
float b_key1 = keys_shared1[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[0] = p ? a_key0 : b_key0; | |
results1[0] = p ? a_key1 : b_key1; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[1] = p ? a_key0 : b_key0; | |
results1[1] = p ? a_key1 : b_key1; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[2] = p ? a_key0 : b_key0; | |
results1[2] = p ? a_key1 : b_key1; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[3] = p ? a_key0 : b_key0; | |
results1[3] = p ? a_key1 : b_key1; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[4] = p ? a_key0 : b_key0; | |
results1[4] = p ? a_key1 : b_key1; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[5] = p ? a_key0 : b_key0; | |
results1[5] = p ? a_key1 : b_key1; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[6] = p ? a_key0 : b_key0; | |
results1[6] = p ? a_key1 : b_key1; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
/* Exchange an (int, float) key pair between two slots. */
void swap_int_float
    (
        int * a0,
        float * a1,
        int * b0,
        float * b1
    )
{
    int ti = *a0;
    *a0 = *b0;
    *b0 = ti;
    float tf = *a1;
    *a1 = *b1;
    *b1 = tf;
}
/* Odd-even transposition sorting network over 7 per-thread (int, float)
   keys: 7 alternating passes (even pairs first) perform exactly the
   same compare-and-swap sequence as the generator's unrolled version. */
void odd_even_transpose_sort_7_int_float
    (
        int * keys0,
        float * keys1
    )
{
    for (int pass = 0; pass < 7; ++pass)
    {
        for (int i = pass & 1; i + 1 < 7; i += 2)
        {
            if (comp(keys0[i + 1], keys1[i + 1], keys0[i], keys1[i]))
            {
                swap_int_float(keys0 + i, keys1 + i, keys0 + i + 1, keys1 + i + 1);
            }
        }
    }
}
int merge_path_int_float | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const float * a1, | |
local const int * b0, | |
local const float * b1 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void block_sort_pass_1_7_int_float | |
( | |
int tid, | |
int count, | |
int coop, | |
int * indices, | |
local const int * keys_shared0, | |
local const float * keys_shared1, | |
int * keys0, | |
float * keys1 | |
) | |
{ | |
int list = ~(coop - 1) & tid; | |
int diag = min(count, 7 * ((coop - 1) & tid)); | |
int start = 7 * list; | |
int a0 = min(count, start); | |
int b0 = min(count, start + 7 * (coop / 2)); | |
int b1 = min(count, start + 7 * coop); | |
int p = merge_path_int_float(b0 - a0, b1 - b0, diag, keys_shared0 + a0, keys_shared1 + a0, keys_shared0 + b0, keys_shared1 + b0); | |
serial_merge_7_int_float(a0 + p, b0, b0 + diag - p, b1, indices, keys_shared0, keys_shared1, keys0, keys1); | |
} | |
void block_sort_loop_1_7_int_float | |
( | |
int tid, | |
int count, | |
local int * keys_shared0, | |
local float * keys_shared1 | |
) | |
{ | |
int indices[7]; | |
int keys0[7]; | |
float keys1[7]; | |
} | |
void mergesort_1_7_int_float | |
( | |
int count, | |
int tid, | |
int * thread_keys0, | |
float * thread_keys1, | |
local int * keys_shared0, | |
local float * keys_shared1 | |
) | |
{ | |
if(7 * tid < count) odd_even_transpose_sort_7_int_float(thread_keys0, thread_keys1); | |
thread_to_shared_7_int(thread_keys0, tid, keys_shared0); | |
thread_to_shared_7_float(thread_keys1, tid, keys_shared1); | |
block_sort_loop_1_7_int_float(tid, count, keys_shared0, keys_shared1); | |
} | |
kernel void block_sort | |
( | |
int count, | |
global const int * keys_src0, | |
global const float * keys_src1, | |
global int * keys_dst0, | |
global float * keys_dst1 | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
int keys0[8]; | |
float keys1[8]; | |
}; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int gid = 7 * block; | |
int count2 = min(7, count - gid); | |
int thread_keys0[7]; | |
float thread_keys1[7]; | |
global_to_shared_1_7_int(count2, keys_src0 + gid, tid, shared.keys0); | |
global_to_shared_1_7_float(count2, keys_src1 + gid, tid, shared.keys1); | |
shared_to_thread_7_int(shared.keys0, tid, thread_keys0); | |
shared_to_thread_7_float(shared.keys1, tid, thread_keys1); | |
int first = 7 * tid; | |
if(first + 7 > count2 && first < count2) | |
{ | |
int max_key0 = thread_keys0[0]; | |
float max_key1 = thread_keys1[0]; | |
if(first + 1 < count2 && comp(max_key0, max_key1, thread_keys0[1], thread_keys1[1]) ) | |
{ | |
max_key0 = thread_keys0[1]; | |
max_key1 = thread_keys1[1]; | |
} | |
if(first + 2 < count2 && comp(max_key0, max_key1, thread_keys0[2], thread_keys1[2]) ) | |
{ | |
max_key0 = thread_keys0[2]; | |
max_key1 = thread_keys1[2]; | |
} | |
if(first + 3 < count2 && comp(max_key0, max_key1, thread_keys0[3], thread_keys1[3]) ) | |
{ | |
max_key0 = thread_keys0[3]; | |
max_key1 = thread_keys1[3]; | |
} | |
if(first + 4 < count2 && comp(max_key0, max_key1, thread_keys0[4], thread_keys1[4]) ) | |
{ | |
max_key0 = thread_keys0[4]; | |
max_key1 = thread_keys1[4]; | |
} | |
if(first + 5 < count2 && comp(max_key0, max_key1, thread_keys0[5], thread_keys1[5]) ) | |
{ | |
max_key0 = thread_keys0[5]; | |
max_key1 = thread_keys1[5]; | |
} | |
if(first + 6 < count2 && comp(max_key0, max_key1, thread_keys0[6], thread_keys1[6]) ) | |
{ | |
max_key0 = thread_keys0[6]; | |
max_key1 = thread_keys1[6]; | |
} | |
if(first + 0 >= count2) | |
{ | |
thread_keys0[0] = max_key0; | |
thread_keys1[0] = max_key1; | |
} | |
if(first + 1 >= count2) | |
{ | |
thread_keys0[1] = max_key0; | |
thread_keys1[1] = max_key1; | |
} | |
if(first + 2 >= count2) | |
{ | |
thread_keys0[2] = max_key0; | |
thread_keys1[2] = max_key1; | |
} | |
if(first + 3 >= count2) | |
{ | |
thread_keys0[3] = max_key0; | |
thread_keys1[3] = max_key1; | |
} | |
if(first + 4 >= count2) | |
{ | |
thread_keys0[4] = max_key0; | |
thread_keys1[4] = max_key1; | |
} | |
if(first + 5 >= count2) | |
{ | |
thread_keys0[5] = max_key0; | |
thread_keys1[5] = max_key1; | |
} | |
if(first + 6 >= count2) | |
{ | |
thread_keys0[6] = max_key0; | |
thread_keys1[6] = max_key1; | |
} | |
} | |
mergesort_1_7_int_float(count2, tid, thread_keys0, thread_keys1, shared.keys0, shared.keys1); | |
shared_to_global_1_7_int(count2, shared.keys0, tid, keys_dst0 + gid); | |
shared_to_global_1_7_float(count2, shared.keys1, tid, keys_dst1 + gid); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
/* Lexicographic order on (int, float) key pairs: primary int key first,
   float key breaks ties. */
bool comp
    (
        int a1,
        float a2,
        int b1,
        float b2
    )
{
    if (a1 != b1) return a1 < b1;
    return a2 < b2;
}
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
/* Translate the block-local merge-path splits (mp0, mp1) into absolute
   source ranges [x, y) of list A and [z, w) of list B, clamped to
   `count`. */
int4 find_mergesort_interval
    (
        int4 frame,
        int coop,
        int block,
        int nv,
        int count,
        int mp0,
        int mp1
    )
{
    int diag = nv * block - frame.x;
    int4 iv;
    iv.x = frame.x + mp0;
    iv.y = min(count, frame.x + mp1);
    iv.z = min(count, frame.y + diag - mp0);
    iv.w = min(count, frame.y + diag + nv - mp1);
    /* The last block of a cooperative group drains both lists fully. */
    if (coop - 1 == ((coop - 1) & block))
    {
        iv.y = min(count, frame.x + frame.z);
        iv.w = min(count, frame.y + frame.z);
    }
    return iv;
}
int4 compute_merge_range | |
( | |
int a_count, | |
int b_count, | |
int block, | |
int coop, | |
int nv, | |
global const int * mp_global | |
) | |
{ | |
int mp0 = mp_global[block]; | |
int mp1 = mp_global[block + 1]; | |
int gid = nv * block; | |
int4 range; | |
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, block, nv); | |
range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1); | |
} | |
else | |
{ | |
range.x = mp0; | |
range.y = mp1; | |
range.z = gid - range.x; | |
range.w = min(a_count + b_count, gid + nv) - range.y; | |
} | |
return range; | |
} | |
void global_to_regstr_pred_1_7_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
} | |
void global_to_regstr_1_7_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 7) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
} else global_to_regstr_pred_1_7_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_7_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_7_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_7_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_7_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[7]; | |
global_to_regstr_1_7_float(count, source, tid, reg); | |
regstr_to_shared_1_7_float(reg, tid, dest); | |
} | |
void shared_to_global_1_7_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_7_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[7 * tid + 0]; | |
reg[1] = data[7 * tid + 1]; | |
reg[2] = data[7 * tid + 2]; | |
reg[3] = data[7 * tid + 3]; | |
reg[4] = data[7 * tid + 4]; | |
reg[5] = data[7 * tid + 5]; | |
reg[6] = data[7 * tid + 6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_7_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[7 * tid + 0] = reg[0]; | |
dest[7 * tid + 1] = reg[1]; | |
dest[7 * tid + 2] = reg[2]; | |
dest[7 * tid + 3] = reg[3]; | |
dest[7 * tid + 4] = reg[4]; | |
dest[7 * tid + 5] = reg[5]; | |
dest[7 * tid + 6] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_7_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
} | |
void global_to_regstr_1_7_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 7) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
} else global_to_regstr_pred_1_7_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_7_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_7_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_7_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_7_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[7]; | |
global_to_regstr_1_7_int(count, source, tid, reg); | |
regstr_to_shared_1_7_int(reg, tid, dest); | |
} | |
void shared_to_global_1_7_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_7_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[7 * tid + 0]; | |
reg[1] = data[7 * tid + 1]; | |
reg[2] = data[7 * tid + 2]; | |
reg[3] = data[7 * tid + 3]; | |
reg[4] = data[7 * tid + 4]; | |
reg[5] = data[7 * tid + 5]; | |
reg[6] = data[7 * tid + 6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_7_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[7 * tid + 0] = reg[0]; | |
dest[7 * tid + 1] = reg[1]; | |
dest[7 * tid + 2] = reg[2]; | |
dest[7 * tid + 3] = reg[3]; | |
dest[7 * tid + 4] = reg[4]; | |
dest[7 * tid + 5] = reg[5]; | |
dest[7 * tid + 6] = reg[6]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_7_int_float
(
    int a_begin,
    int a_end,
    int b_begin,
    int b_end,
    int * indices,
    local const int * keys_shared0,
    local const float * keys_shared1,
    int * results0,
    float * results1
)
{
    /* Sequentially merge 7 (int, float) key pairs from the A range
       [a_begin, a_end) and the B range [b_begin, b_end) of the staged
       local-memory arrays, recording for each output the winning key
       pair and the local index it came from. */
    int a_key0 = keys_shared0[a_begin];
    int b_key0 = keys_shared0[b_begin];
    float a_key1 = keys_shared1[a_begin];
    float b_key1 = keys_shared1[b_begin];
    for (int i = 0; i < 7; ++i)
    {
        /* Take from A when B is exhausted, or when A has elements left
           and B's head does not compare less than A's head (stable). */
        bool p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1));
        results0[i] = p ? a_key0 : b_key0;
        results1[i] = p ? a_key1 : b_key1;
        indices[i] = p ? a_begin : b_begin;
        if(p)
        {
            ++a_begin;
            a_key0 = keys_shared0[a_begin];
            a_key1 = keys_shared1[a_begin];
        }
        else
        {
            ++b_begin;
            b_key0 = keys_shared0[b_begin];
            b_key1 = keys_shared1[b_begin];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
/*
 * Merge Path search over local memory: given the cross-diagonal `diag`
 * of the conceptual merge grid of sorted sequences A (a0/a1, length
 * a_count) and B (b0/b1, length b_count), binary-search the number of
 * elements of A that precede that diagonal in the merged output.
 * Keys are (int, float) pairs compared with comp().
 */
int merge_path_int_float
(
int a_count,
int b_count,
int diag,
local const int * a0,
local const float * a1,
local const int * b0,
local const float * b1
)
{
/* The split point along the diagonal can only lie in [begin, end). */
int begin = max(0, diag - b_count);
int end = min(diag, a_count);
while (begin < end)
{
int mid = (begin + end) >> 1;
/* comp(B, A) false means B[diag-1-mid] >= A[mid]: split lies further into A. */
if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1;
else end = mid;
}
return begin;
}
void load2_to_regstr_1_7_7_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    float * reg
)
{
    /* Fused load of two global sequences into registers, as if A and B
       were one concatenated array: indices < a_count read from A, the
       rest from B (rebased so the same index works for both). */
    b_global -= a_count;
    int total = a_count + b_count;
    if (total >= 7)
    {
        /* Full tile: every index is in range, no tail guard needed. */
        for (int i = 0; i < 7; ++i)
        {
            int idx = i + tid;
            reg[i] = (idx < a_count) ? a_global[idx] : b_global[idx];
        }
    }
    else
    {
        /* Partial tile: also guard against reading past A+B. */
        for (int i = 0; i < 7; ++i)
        {
            int idx = i + tid;
            if (idx < a_count) reg[i] = a_global[idx];
            else if (idx < total) reg[i] = b_global[idx];
        }
    }
}
void load2_to_shared_1_7_7_float
(
    global const float * a_global,
    int a_count,
    global const float * b_global,
    int b_count,
    int tid,
    local float * shared
)
{
    /* Stage the fused A+B load through registers, then publish the
       tile to local memory. */
    float staging[7];
    load2_to_regstr_1_7_7_float(a_global, a_count, b_global, b_count, tid, staging);
    regstr_to_shared_1_7_float(staging, tid, shared);
}
void load2_to_regstr_1_7_7_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    int * reg
)
{
    /* Fused load of two global sequences into registers, as if A and B
       were one concatenated array: indices < a_count read from A, the
       rest from B (rebased so the same index works for both). */
    b_global -= a_count;
    int total = a_count + b_count;
    if (total >= 7)
    {
        /* Full tile: every index is in range, no tail guard needed. */
        for (int i = 0; i < 7; ++i)
        {
            int idx = i + tid;
            reg[i] = (idx < a_count) ? a_global[idx] : b_global[idx];
        }
    }
    else
    {
        /* Partial tile: also guard against reading past A+B. */
        for (int i = 0; i < 7; ++i)
        {
            int idx = i + tid;
            if (idx < a_count) reg[i] = a_global[idx];
            else if (idx < total) reg[i] = b_global[idx];
        }
    }
}
void load2_to_shared_1_7_7_int
(
    global const int * a_global,
    int a_count,
    global const int * b_global,
    int b_count,
    int tid,
    local int * shared
)
{
    /* Stage the fused A+B load through registers, then publish the
       tile to local memory. */
    int staging[7];
    load2_to_regstr_1_7_7_int(a_global, a_count, b_global, b_count, tid, staging);
    regstr_to_shared_1_7_int(staging, tid, shared);
}
/*
 * Cooperative tile merge: load the clipped A and B key ranges into local
 * memory, find this thread's Merge Path split, then serially merge 7
 * (int, float) key pairs into the private results/indices arrays.
 * NOTE(review): the incoming a_count/b_count are overwritten from
 * `range` before first use, so callers' values are ignored here.
 */
void merge_keys_indices_1_7_int_float
(
int a_count,
int b_count,
int4 range,
int tid,
int * indices,
global const int * a_global0,
global const float * a_global1,
global const int * b_global0,
global const float * b_global1,
local int * keys_shared0,
local float * keys_shared1,
int * results0,
float * results1
)
{
/* range = (a_begin, a_end, b_begin, b_end) in global coordinates. */
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
a_count = a1 - a0;
b_count = b1 - b0;
/* Stage both sequences contiguously: A at [0, a_count), B right after. */
load2_to_shared_1_7_7_int(a_global0 + a0, a_count, b_global0 + b0, b_count, tid, keys_shared0);
load2_to_shared_1_7_7_float(a_global1 + a0, a_count, b_global1 + b0, b_count, tid, keys_shared1);
/* Each thread merges 7 outputs starting at its own cross-diagonal. */
int diag = 7 * tid;
int mp = merge_path_int_float(a_count, b_count, diag, keys_shared0, keys_shared1, keys_shared0 + a_count, keys_shared1 + a_count);
/* This thread consumes A in [mp, a_count) and B in [a_count + diag - mp, a_count + b_count). */
int a0tid = mp;
int a1tid = a_count;
int b0tid = a_count + diag - mp;
int b1tid = a_count + b_count;
serial_merge_7_int_float(a0tid, a1tid, b0tid, b1tid, indices, keys_shared0, keys_shared1, results0, results1);
}
/*
 * Per-block merge step: merge this block's A/B ranges of (int primary,
 * float secondary) keys and write the merged tile back to global memory
 * at the block's output offset.
 */
void device_merge_1_7_int_float
(
int a_count,
int b_count,
global const int * a_keys_global0,
global const float * a_keys_global1,
global const int * b_keys_global0,
global const float * b_keys_global1,
global int * keys_global0,
global float * keys_global1,
local int * keys_shared0,
local float * keys_shared1,
int tid,
int block,
int4 range,
local int * indices_shared
)
{
/* Each thread produces 7 merged (key0, key1) pairs plus source indices. */
int results0[7];
float results1[7];
int indices[7];
merge_keys_indices_1_7_int_float(a_count, b_count, range, tid, indices, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_shared0, keys_shared1, results0, results1);
/* Stage per-thread results into local memory so the tile can be written
   out with bounds-checked strided stores. */
thread_to_shared_7_int(results0, tid, keys_shared0);
thread_to_shared_7_float(results1, tid, keys_shared1);
/* Actual element counts for this block come from the clipped range. */
a_count = range.y - range.x;
b_count = range.w - range.z;
shared_to_global_1_7_int(a_count + b_count, keys_shared0, tid, keys_global0 + 7 * block);
shared_to_global_1_7_float(a_count + b_count, keys_shared1, tid, keys_global1 + 7 * block);
/* NOTE(review): indices_shared is accepted but never referenced in this
   keys-only variant. */
}
/*
 * Kernel entry point: merge two sorted (int key, float key) sequences.
 * mp_global holds precomputed Merge Path splits (one per partition,
 * produced by the merge_partition kernel); coop selects mergesort-frame
 * addressing when several blocks cooperate on a larger tile.
 */
kernel void merge
(
int a_count,
int b_count,
global const int * a_keys_global0,
global const float * a_keys_global1,
global const int * b_keys_global0,
global const float * b_keys_global1,
global int * keys_global0,
global float * keys_global1,
global const int * mp_global,
int coop
)
{
/* Local scratch: staged keys and merge indices overlay each other in a
   union (presumably never live at the same time — generator invariant).
   Sizes 8/7 come from the generator's (block size 1, 7 values/thread)
   parameters — TODO confirm against the host code. */
union Shared
{
struct
{
int keys0[8];
float keys1[8];
};
int indices[7];
};
local union Shared shared;
int tid = get_local_id(0);
int block = get_group_id(0);
/* 7 = number of values merged per thread. */
int4 range = compute_merge_range(a_count, b_count, block, coop, 7, mp_global);
device_merge_1_7_int_float(a_count, b_count, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_global0, keys_global1, shared.keys0, shared.keys1, tid, block, range, shared.indices);
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    /* Lexicographic less-than on (int, float) key pairs: the int keys
       decide; ties fall through to the secondary float keys. */
    if (a1 != b1) return a1 < b1;
    return a2 < b2;
}
/*
 * Merge Path search over global memory: given the cross-diagonal `diag`
 * of the conceptual merge grid of sorted sequences A (a0/a1) and B
 * (b0/b1), binary-search the number of elements of A that precede that
 * diagonal in the merged output. Same algorithm as the local-memory
 * overload; only the address space of the inputs differs.
 */
int merge_path_int_float
(
int a_count,
int b_count,
int diag,
global const int * a0,
global const float * a1,
global const int * b0,
global const float * b1
)
{
/* The split point along the diagonal can only lie in [begin, end). */
int begin = max(0, diag - b_count);
int end = min(diag, a_count);
while (begin < end)
{
int mid = (begin + end) >> 1;
/* comp(B, A) false means B[diag-1-mid] >= A[mid]: split lies further into A. */
if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1;
else end = mid;
}
return begin;
}
/*
 * Locate the mergesort frame a block belongs to when `coop` blocks
 * (assumed a power of two — TODO confirm host-side invariant) cooperate
 * on one merge: returns x = start of the left half, y = start of the
 * right half, z = elements per half, with nv elements per block.
 * NOTE(review): frame.w is never assigned; the visible caller
 * (merge_partition) reads only .x/.y/.z — confirm no caller reads .w.
 */
int4 find_mergesort_frame
(
int coop,
int block,
int nv
)
{
/* First block of this cooperating group. */
int start = ~(coop - 1) & block;
/* Elements in each half of the frame. */
int size = nv * (coop>> 1);
int4 frame;
frame.x = nv * start;
frame.y = nv * start + size;
frame.z = size;
return frame;
}
/*
 * Kernel: one work-item per partition computes the Merge Path split
 * ("mp") for a cross-diagonal spaced nv elements apart, and stores it
 * in mp_global for the merge kernel. When coop != 0 the diagonal is
 * taken inside a mergesort frame rather than over the full A/B ranges.
 */
kernel void merge_partition
(
int a_count,
int b_count,
int nv,
int coop,
global int * mp_global,
int num_searches,
global const int * a_global0,
global const float * a_global1,
global const int * b_global0,
global const float * b_global1
)
{
int partition = get_global_id(0);
if (partition < num_searches)
{
int a0 = 0, b0 = 0;
int gid = nv * partition;
if(coop)
{
/* Clip A/B to this mergesort frame and make gid frame-relative. */
int4 frame = find_mergesort_frame(coop, partition, nv);
a0 = frame.x;
b0 = min(a_count, frame.y);
b_count = min(a_count, frame.y + frame.z) - b0;
a_count = min(a_count, frame.x + frame.z) - a0;
gid -= a0;
}
/* Diagonal is clamped so the last partial partition stays in range. */
int mp = merge_path_int_float(a_count, b_count, min(gid, a_count + b_count), a_global0 + a0, a_global1 + a0, b_global0 + b0, b_global1 + b0);
mp_global[partition] = mp;
}
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
    int a1,
    float a2,
    int b1,
    float b2
)
{
    /* Order (int, float) key pairs lexicographically: compare the
       primary int keys first, break ties with the float keys. */
    bool tie = (a1 == b1);
    return tie ? (a2 < b2) : (a1 < b1);
}
void global_to_regstr_pred_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    /* Bounds-checked gather of up to 11 elements into private registers. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_short
(
    int count,
    global const short * data,
    int tid,
    short * reg
)
{
    /* Fast path when a full tile is present: load all 11 elements
       without per-element bounds checks; otherwise fall back to the
       guarded loader. */
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    }
    else global_to_regstr_pred_1_11_short(count, data, tid, reg);
}
void regstr_to_global_1_11_short
(
    int count,
    const short * reg,
    int tid,
    global short * dest
)
{
    /* Bounds-checked scatter of 11 register values to global memory,
       followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    /* Unconditional strided load from local memory into registers. */
    for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    /* Unconditional strided store from registers into local memory. */
    for (int i = 0; i < 11; ++i) dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_short
(
    int count,
    global const short * source,
    int tid,
    local short * dest
)
{
    /* Stage through registers: global -> private -> local. */
    short staging[11];
    global_to_regstr_1_11_short(count, source, tid, staging);
    regstr_to_shared_1_11_short(staging, tid, dest);
}
void shared_to_global_1_11_short
(
    int count,
    local const short * source,
    int tid,
    global short * dest
)
{
    /* Bounds-checked copy of up to 11 strided elements from local to
       global memory, followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_short
(
    local const short * data,
    int tid,
    short * reg
)
{
    /* Load this thread's contiguous 11-element slice of local memory. */
    for (int i = 0; i < 11; ++i) reg[i] = data[11 * tid + i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_short
(
    const short * reg,
    int tid,
    local short * dest
)
{
    /* Store this thread's 11 register values as a contiguous local slice. */
    for (int i = 0; i < 11; ++i) dest[11 * tid + i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_regstr_pred_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    /* Bounds-checked gather of up to 11 elements into private registers. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_long
(
    int count,
    global const long * data,
    int tid,
    long * reg
)
{
    /* Fast path when a full tile is present: load all 11 elements
       without per-element bounds checks; otherwise fall back to the
       guarded loader. */
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    }
    else global_to_regstr_pred_1_11_long(count, data, tid, reg);
}
void regstr_to_global_1_11_long
(
    int count,
    const long * reg,
    int tid,
    global long * dest
)
{
    /* Bounds-checked scatter of 11 register values to global memory,
       followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    /* Unconditional strided load from local memory into registers. */
    for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    /* Unconditional strided store from registers into local memory. */
    for (int i = 0; i < 11; ++i) dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_long
(
    int count,
    global const long * source,
    int tid,
    local long * dest
)
{
    /* Stage through registers: global -> private -> local. */
    long staging[11];
    global_to_regstr_1_11_long(count, source, tid, staging);
    regstr_to_shared_1_11_long(staging, tid, dest);
}
void shared_to_global_1_11_long
(
    int count,
    local const long * source,
    int tid,
    global long * dest
)
{
    /* Bounds-checked copy of up to 11 strided elements from local to
       global memory, followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_long
(
    local const long * data,
    int tid,
    long * reg
)
{
    /* Load this thread's contiguous 11-element slice of local memory. */
    for (int i = 0; i < 11; ++i) reg[i] = data[11 * tid + i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_long
(
    const long * reg,
    int tid,
    local long * dest
)
{
    /* Store this thread's 11 register values as a contiguous local slice. */
    for (int i = 0; i < 11; ++i) dest[11 * tid + i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_regstr_pred_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    /* Bounds-checked gather of up to 11 elements into private registers. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_float
(
    int count,
    global const float * data,
    int tid,
    float * reg
)
{
    /* Fast path when a full tile is present: load all 11 elements
       without per-element bounds checks; otherwise fall back to the
       guarded loader. */
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    }
    else global_to_regstr_pred_1_11_float(count, data, tid, reg);
}
void regstr_to_global_1_11_float
(
    int count,
    const float * reg,
    int tid,
    global float * dest
)
{
    /* Bounds-checked scatter of 11 register values to global memory,
       followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    /* Unconditional strided load from local memory into registers. */
    for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void regstr_to_shared_1_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    /* Unconditional strided store from registers into local memory. */
    for (int i = 0; i < 11; ++i) dest[i + tid] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_shared_1_11_float
(
    int count,
    global const float * source,
    int tid,
    local float * dest
)
{
    /* Stage through registers: global -> private -> local. */
    float staging[11];
    global_to_regstr_1_11_float(count, source, tid, staging);
    regstr_to_shared_1_11_float(staging, tid, dest);
}
void shared_to_global_1_11_float
(
    int count,
    local const float * source,
    int tid,
    global float * dest
)
{
    /* Bounds-checked copy of up to 11 strided elements from local to
       global memory, followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = source[idx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_thread_11_float
(
    local const float * data,
    int tid,
    float * reg
)
{
    /* Load this thread's contiguous 11-element slice of local memory. */
    for (int i = 0; i < 11; ++i) reg[i] = data[11 * tid + i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void thread_to_shared_11_float
(
    const float * reg,
    int tid,
    local float * dest
)
{
    /* Store this thread's 11 register values as a contiguous local slice. */
    for (int i = 0; i < 11; ++i) dest[11 * tid + i] = reg[i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
void global_to_regstr_pred_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    /* Bounds-checked gather of up to 11 elements into private registers. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) reg[i] = data[idx];
    }
}
void global_to_regstr_1_11_int
(
    int count,
    global const int * data,
    int tid,
    int * reg
)
{
    /* Fast path when a full tile is present: load all 11 elements
       without per-element bounds checks; otherwise fall back to the
       guarded loader. */
    if (count >= 11)
    {
        for (int i = 0; i < 11; ++i) reg[i] = data[i + tid];
    }
    else global_to_regstr_pred_1_11_int(count, data, tid, reg);
}
void regstr_to_global_1_11_int
(
    int count,
    const int * reg,
    int tid,
    global int * dest
)
{
    /* Bounds-checked scatter of 11 register values to global memory,
       followed by a work-group barrier. */
    for (int i = 0; i < 11; ++i)
    {
        int idx = i + tid;
        if (idx < count) dest[idx] = reg[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_int_float | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
local const float * keys_shared1, | |
int * results0, | |
float * results1 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
float a_key1 = keys_shared1[a_begin]; | |
float b_key1 = keys_shared1[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[0] = p ? a_key0 : b_key0; | |
results1[0] = p ? a_key1 : b_key1; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[1] = p ? a_key0 : b_key0; | |
results1[1] = p ? a_key1 : b_key1; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[2] = p ? a_key0 : b_key0; | |
results1[2] = p ? a_key1 : b_key1; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[3] = p ? a_key0 : b_key0; | |
results1[3] = p ? a_key1 : b_key1; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[4] = p ? a_key0 : b_key0; | |
results1[4] = p ? a_key1 : b_key1; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[5] = p ? a_key0 : b_key0; | |
results1[5] = p ? a_key1 : b_key1; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[6] = p ? a_key0 : b_key0; | |
results1[6] = p ? a_key1 : b_key1; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[7] = p ? a_key0 : b_key0; | |
results1[7] = p ? a_key1 : b_key1; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[8] = p ? a_key0 : b_key0; | |
results1[8] = p ? a_key1 : b_key1; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[9] = p ? a_key0 : b_key0; | |
results1[9] = p ? a_key1 : b_key1; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[10] = p ? a_key0 : b_key0; | |
results1[10] = p ? a_key1 : b_key1; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void swap_int_float
(
int * a0,
float * a1,
int * b0,
float * b1
)
{
/* Exchange an (int, float) pair component-wise between *a and *b. */
int t0 = *a0;
*a0 = *b0;
*b0 = t0;
float t1 = *a1;
*a1 = *b1;
*b1 = t1;
}
void swap_long_short
(
long * a0,
short * a1,
long * b0,
short * b1
)
{
/* Exchange a (long, short) pair component-wise between *a and *b. */
long t0 = *a0;
*a0 = *b0;
*b0 = t0;
short t1 = *a1;
*a1 = *b1;
*b1 = t1;
}
void odd_even_transpose_sort_11_int_float_long_short
(
int * keys0,
float * keys1,
long * vals0,
short * vals1
)
{
/* Odd-even transposition sort of 11 (int, float) keys with attached
(long, short) values, all held in registers. Pass I compares adjacent
pairs starting at index (I & 1); 11 passes fully sort the sequence
under the ordering defined by comp(). The original emitted all 55
compare-exchanges unrolled; the pass structure is re-rolled here in
the same order. */
for (int pass = 0; pass < 11; ++pass)
{
for (int i = pass & 1; i < 10; i += 2)
{
if (comp(keys0[i + 1], keys1[i + 1], keys0[i], keys1[i]))
{
swap_int_float(keys0 + i, keys1 + i, keys0 + i + 1, keys1 + i + 1);
swap_long_short(vals0 + i, vals1 + i, vals0 + i + 1, vals1 + i + 1);
}
}
}
}
int merge_path_int_float | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const float * a1, | |
local const int * b0, | |
local const float * b1 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
/* One merge pass of the in-block mergesort: groups of `coop` threads
cooperate to merge two adjacent sorted runs held in local memory.
Each thread locates its cross-diagonal with merge_path and serially
merges its 11 outputs (and their source indices) into registers. */
void block_sort_pass_1_11_int_float
(
int tid,
int count,
int coop,
int * indices,
local const int * keys_shared0,
local const float * keys_shared1,
int * keys0,
float * keys1
)
{
/* First thread of this cooperative group. */
int list = ~(coop - 1) & tid;
/* This thread's diagonal inside the group's merge window, clamped. */
int diag = min(count, 11 * ((coop - 1) & tid));
int start = 11 * list;
/* [a0,b0) and [b0,b1) are the two sorted input runs, clamped to count. */
int a0 = min(count, start);
int b0 = min(count, start + 11 * (coop / 2));
int b1 = min(count, start + 11 * coop);
int p = merge_path_int_float(b0 - a0, b1 - b0, diag, keys_shared0 + a0, keys_shared1 + a0, keys_shared0 + b0, keys_shared1 + b0);
serial_merge_11_int_float(a0 + p, b0, b0 + diag - p, b1, indices, keys_shared0, keys_shared1, keys0, keys1);
}
void gather_1_11_long_short | |
( | |
const int * indices, | |
int tid, | |
local const long * data0, | |
local const short * data1, | |
long * reg0, | |
short * reg1 | |
) | |
{ | |
reg0[0] = data0[indices[0]]; | |
reg1[0] = data1[indices[0]]; | |
reg0[1] = data0[indices[1]]; | |
reg1[1] = data1[indices[1]]; | |
reg0[2] = data0[indices[2]]; | |
reg1[2] = data1[indices[2]]; | |
reg0[3] = data0[indices[3]]; | |
reg1[3] = data1[indices[3]]; | |
reg0[4] = data0[indices[4]]; | |
reg1[4] = data1[indices[4]]; | |
reg0[5] = data0[indices[5]]; | |
reg1[5] = data1[indices[5]]; | |
reg0[6] = data0[indices[6]]; | |
reg1[6] = data1[indices[6]]; | |
reg0[7] = data0[indices[7]]; | |
reg1[7] = data1[indices[7]]; | |
reg0[8] = data0[indices[8]]; | |
reg1[8] = data1[indices[8]]; | |
reg0[9] = data0[indices[9]]; | |
reg1[9] = data1[indices[9]]; | |
reg0[10] = data0[indices[10]]; | |
reg1[10] = data1[indices[10]]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void block_sort_loop_1_11_int_float_long_short | |
( | |
int tid, | |
int count, | |
local int * keys_shared0, | |
local float * keys_shared1, | |
long * thread_vals0, | |
short * thread_vals1, | |
local long * vals_shared0, | |
local short * vals_shared1 | |
) | |
{ | |
int indices[11]; | |
int keys0[11]; | |
float keys1[11]; | |
} | |
/* Sort this thread's 11 key/value pairs in registers, publish the keys to
local memory, then run the block-level merge loop (a no-op in this
single-thread instantiation — see block_sort_loop above). */
void mergesort_1_11_int_float_long_short
(
int count,
int tid,
int * thread_keys0,
float * thread_keys1,
local int * keys_shared0,
local float * keys_shared1,
long * thread_vals0,
short * thread_vals1,
local long * vals_shared0,
local short * vals_shared1
)
{
/* Skip the register sort for threads whose tile lies entirely past count. */
if(11 * tid < count) odd_even_transpose_sort_11_int_float_long_short(thread_keys0, thread_keys1, thread_vals0, thread_vals1);
thread_to_shared_11_int(thread_keys0, tid, keys_shared0);
thread_to_shared_11_float(thread_keys1, tid, keys_shared1);
block_sort_loop_1_11_int_float_long_short(tid, count, keys_shared0, keys_shared1, thread_vals0, thread_vals1, vals_shared0, vals_shared1);
}
kernel void block_sort | |
( | |
int count, | |
global const int * keys_src0, | |
global const float * keys_src1, | |
global int * keys_dst0, | |
global float * keys_dst1, | |
global const long * vals_src0, | |
global const short * vals_src1, | |
global long * vals_dst0, | |
global short * vals_dst1 | |
) | |
{ | |
union Shared | |
{ | |
struct | |
{ | |
int keys0[12]; | |
float keys1[12]; | |
}; | |
struct | |
{ | |
long vals0[11]; | |
short vals1[11]; | |
}; | |
}; | |
local union Shared shared; | |
int tid = get_local_id(0); | |
int block = get_group_id(0); | |
int gid = 11 * block; | |
int count2 = min(11, count - gid); | |
long thread_vals0[11]; | |
short thread_vals1[11]; | |
global_to_shared_1_11_long(count2, vals_src0 + gid, tid, shared.vals0); | |
global_to_shared_1_11_short(count2, vals_src1 + gid, tid, shared.vals1); | |
shared_to_thread_11_long(shared.vals0, tid, thread_vals0); | |
shared_to_thread_11_short(shared.vals1, tid, thread_vals1); | |
int thread_keys0[11]; | |
float thread_keys1[11]; | |
global_to_shared_1_11_int(count2, keys_src0 + gid, tid, shared.keys0); | |
global_to_shared_1_11_float(count2, keys_src1 + gid, tid, shared.keys1); | |
shared_to_thread_11_int(shared.keys0, tid, thread_keys0); | |
shared_to_thread_11_float(shared.keys1, tid, thread_keys1); | |
int first = 11 * tid; | |
if(first + 11 > count2 && first < count2) | |
{ | |
int max_key0 = thread_keys0[0]; | |
float max_key1 = thread_keys1[0]; | |
if(first + 1 < count2 && comp(max_key0, max_key1, thread_keys0[1], thread_keys1[1]) ) | |
{ | |
max_key0 = thread_keys0[1]; | |
max_key1 = thread_keys1[1]; | |
} | |
if(first + 2 < count2 && comp(max_key0, max_key1, thread_keys0[2], thread_keys1[2]) ) | |
{ | |
max_key0 = thread_keys0[2]; | |
max_key1 = thread_keys1[2]; | |
} | |
if(first + 3 < count2 && comp(max_key0, max_key1, thread_keys0[3], thread_keys1[3]) ) | |
{ | |
max_key0 = thread_keys0[3]; | |
max_key1 = thread_keys1[3]; | |
} | |
if(first + 4 < count2 && comp(max_key0, max_key1, thread_keys0[4], thread_keys1[4]) ) | |
{ | |
max_key0 = thread_keys0[4]; | |
max_key1 = thread_keys1[4]; | |
} | |
if(first + 5 < count2 && comp(max_key0, max_key1, thread_keys0[5], thread_keys1[5]) ) | |
{ | |
max_key0 = thread_keys0[5]; | |
max_key1 = thread_keys1[5]; | |
} | |
if(first + 6 < count2 && comp(max_key0, max_key1, thread_keys0[6], thread_keys1[6]) ) | |
{ | |
max_key0 = thread_keys0[6]; | |
max_key1 = thread_keys1[6]; | |
} | |
if(first + 7 < count2 && comp(max_key0, max_key1, thread_keys0[7], thread_keys1[7]) ) | |
{ | |
max_key0 = thread_keys0[7]; | |
max_key1 = thread_keys1[7]; | |
} | |
if(first + 8 < count2 && comp(max_key0, max_key1, thread_keys0[8], thread_keys1[8]) ) | |
{ | |
max_key0 = thread_keys0[8]; | |
max_key1 = thread_keys1[8]; | |
} | |
if(first + 9 < count2 && comp(max_key0, max_key1, thread_keys0[9], thread_keys1[9]) ) | |
{ | |
max_key0 = thread_keys0[9]; | |
max_key1 = thread_keys1[9]; | |
} | |
if(first + 10 < count2 && comp(max_key0, max_key1, thread_keys0[10], thread_keys1[10]) ) | |
{ | |
max_key0 = thread_keys0[10]; | |
max_key1 = thread_keys1[10]; | |
} | |
if(first + 0 >= count2) | |
{ | |
thread_keys0[0] = max_key0; | |
thread_keys1[0] = max_key1; | |
} | |
if(first + 1 >= count2) | |
{ | |
thread_keys0[1] = max_key0; | |
thread_keys1[1] = max_key1; | |
} | |
if(first + 2 >= count2) | |
{ | |
thread_keys0[2] = max_key0; | |
thread_keys1[2] = max_key1; | |
} | |
if(first + 3 >= count2) | |
{ | |
thread_keys0[3] = max_key0; | |
thread_keys1[3] = max_key1; | |
} | |
if(first + 4 >= count2) | |
{ | |
thread_keys0[4] = max_key0; | |
thread_keys1[4] = max_key1; | |
} | |
if(first + 5 >= count2) | |
{ | |
thread_keys0[5] = max_key0; | |
thread_keys1[5] = max_key1; | |
} | |
if(first + 6 >= count2) | |
{ | |
thread_keys0[6] = max_key0; | |
thread_keys1[6] = max_key1; | |
} | |
if(first + 7 >= count2) | |
{ | |
thread_keys0[7] = max_key0; | |
thread_keys1[7] = max_key1; | |
} | |
if(first + 8 >= count2) | |
{ | |
thread_keys0[8] = max_key0; | |
thread_keys1[8] = max_key1; | |
} | |
if(first + 9 >= count2) | |
{ | |
thread_keys0[9] = max_key0; | |
thread_keys1[9] = max_key1; | |
} | |
if(first + 10 >= count2) | |
{ | |
thread_keys0[10] = max_key0; | |
thread_keys1[10] = max_key1; | |
} | |
} | |
mergesort_1_11_int_float_long_short(count2, tid, thread_keys0, thread_keys1, shared.keys0, shared.keys1, thread_vals0, thread_vals1, shared.vals0, shared.vals1); | |
shared_to_global_1_11_int(count2, shared.keys0, tid, keys_dst0 + gid); | |
shared_to_global_1_11_float(count2, shared.keys1, tid, keys_dst1 + gid); | |
thread_to_shared_11_long(thread_vals0, tid, shared.vals0); | |
thread_to_shared_11_short(thread_vals1, tid, shared.vals1); | |
shared_to_global_1_11_long(count2, shared.vals0, tid, vals_dst0 + gid); | |
shared_to_global_1_11_short(count2, shared.vals1, tid, vals_dst1 + gid); | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp
(
int a1,
float a2,
int b1,
float b2
)
{
/* Lexicographic "less than" over the (int, float) key pair:
compare the int components first, break ties on the float. */
if (a1 == b1) return a2 < b2;
return a1 < b1;
}
int4 find_mergesort_frame | |
( | |
int coop, | |
int block, | |
int nv | |
) | |
{ | |
int start = ~(coop - 1) & block; | |
int size = nv * (coop>> 1); | |
int4 frame; | |
frame.x = nv * start; | |
frame.y = nv * start + size; | |
frame.z = size; | |
return frame; | |
} | |
/* Convert two merge-path split points (mp0, mp1) into the concrete source
ranges this block consumes: [x,y) from list A and [z,w) from list B,
all clamped to count. `frame` comes from find_mergesort_frame. */
int4 find_mergesort_interval
(
int4 frame,
int coop,
int block,
int nv,
int count,
int mp0,
int mp1
)
{
/* This block's diagonal offset inside the group's merge window. */
int diag = nv * block - frame.x;
int4 interval;
interval.x = frame.x + mp0;
interval.y = min(count, frame.x + mp1);
interval.z = min(count, frame.y + diag - mp0);
interval.w = min(count, frame.y + diag + nv - mp1);
/* The last block of a cooperative group drains both lists to their ends. */
if(coop - 1 == ((coop - 1) & block))
{
interval.y = min(count, frame.x + frame.z);
interval.w = min(count, frame.y + frame.z);
}
return interval;
}
/* Compute the source ranges a block merges, from precomputed merge-path
split points in mp_global. With coop != 0 this is a mergesort pass
(ranges derived from the cooperative frame); with coop == 0 it is a
plain merge of two independent lists of lengths a_count and b_count. */
int4 compute_merge_range
(
int a_count,
int b_count,
int block,
int coop,
int nv,
global const int * mp_global
)
{
int mp0 = mp_global[block];
int mp1 = mp_global[block + 1];
int gid = nv * block;
int4 range;
if(coop)
{
int4 frame = find_mergesort_frame(coop, block, nv);
range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1);
}
else
{
/* Plain merge: x/y index list A, z/w are the matching spans of list B. */
range.x = mp0;
range.y = mp1;
range.z = gid - range.x;
range.w = min(a_count + b_count, gid + nv) - range.y;
}
return range;
}
void global_to_regstr_pred_1_11_short | |
( | |
int count, | |
global const short * data, | |
int tid, | |
short * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_short | |
( | |
int count, | |
global const short * data, | |
int tid, | |
short * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_short(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_short | |
( | |
int count, | |
const short * reg, | |
int tid, | |
global short * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_short | |
( | |
local const short * data, | |
int tid, | |
short * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_short | |
( | |
const short * reg, | |
int tid, | |
local short * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_short | |
( | |
int count, | |
global const short * source, | |
int tid, | |
local short * dest | |
) | |
{ | |
short reg[11]; | |
global_to_regstr_1_11_short(count, source, tid, reg); | |
regstr_to_shared_1_11_short(reg, tid, dest); | |
} | |
void shared_to_global_1_11_short | |
( | |
int count, | |
local const short * source, | |
int tid, | |
global short * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_short | |
( | |
local const short * data, | |
int tid, | |
short * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_short | |
( | |
const short * reg, | |
int tid, | |
local short * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_long | |
( | |
int count, | |
global const long * data, | |
int tid, | |
long * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_long | |
( | |
int count, | |
global const long * data, | |
int tid, | |
long * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_long(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_long | |
( | |
int count, | |
const long * reg, | |
int tid, | |
global long * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_long | |
( | |
local const long * data, | |
int tid, | |
long * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_long | |
( | |
const long * reg, | |
int tid, | |
local long * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_long | |
( | |
int count, | |
global const long * source, | |
int tid, | |
local long * dest | |
) | |
{ | |
long reg[11]; | |
global_to_regstr_1_11_long(count, source, tid, reg); | |
regstr_to_shared_1_11_long(reg, tid, dest); | |
} | |
void shared_to_global_1_11_long | |
( | |
int count, | |
local const long * source, | |
int tid, | |
global long * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_long | |
( | |
local const long * data, | |
int tid, | |
long * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_long | |
( | |
const long * reg, | |
int tid, | |
local long * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_float | |
( | |
int count, | |
global const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_float(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_float | |
( | |
int count, | |
const float * reg, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_float | |
( | |
int count, | |
global const float * source, | |
int tid, | |
local float * dest | |
) | |
{ | |
float reg[11]; | |
global_to_regstr_1_11_float(count, source, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, dest); | |
} | |
void shared_to_global_1_11_float | |
( | |
int count, | |
local const float * source, | |
int tid, | |
global float * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_float | |
( | |
local const float * data, | |
int tid, | |
float * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_float | |
( | |
const float * reg, | |
int tid, | |
local float * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_regstr_pred_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) reg[0] = data[index]; | |
index = 1 + tid; | |
if (index < count) reg[1] = data[index]; | |
index = 2 + tid; | |
if (index < count) reg[2] = data[index]; | |
index = 3 + tid; | |
if (index < count) reg[3] = data[index]; | |
index = 4 + tid; | |
if (index < count) reg[4] = data[index]; | |
index = 5 + tid; | |
if (index < count) reg[5] = data[index]; | |
index = 6 + tid; | |
if (index < count) reg[6] = data[index]; | |
index = 7 + tid; | |
if (index < count) reg[7] = data[index]; | |
index = 8 + tid; | |
if (index < count) reg[8] = data[index]; | |
index = 9 + tid; | |
if (index < count) reg[9] = data[index]; | |
index = 10 + tid; | |
if (index < count) reg[10] = data[index]; | |
} | |
void global_to_regstr_1_11_int | |
( | |
int count, | |
global const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
if (count >= 11) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
} else global_to_regstr_pred_1_11_int(count, data, tid, reg); | |
} | |
void regstr_to_global_1_11_int | |
( | |
int count, | |
const int * reg, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = reg[0]; | |
index = 1 + tid; | |
if (index < count) dest[index] = reg[1]; | |
index = 2 + tid; | |
if (index < count) dest[index] = reg[2]; | |
index = 3 + tid; | |
if (index < count) dest[index] = reg[3]; | |
index = 4 + tid; | |
if (index < count) dest[index] = reg[4]; | |
index = 5 + tid; | |
if (index < count) dest[index] = reg[5]; | |
index = 6 + tid; | |
if (index < count) dest[index] = reg[6]; | |
index = 7 + tid; | |
if (index < count) dest[index] = reg[7]; | |
index = 8 + tid; | |
if (index < count) dest[index] = reg[8]; | |
index = 9 + tid; | |
if (index < count) dest[index] = reg[9]; | |
index = 10 + tid; | |
if (index < count) dest[index] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_regstr_1_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[0 + tid]; | |
reg[1] = data[1 + tid]; | |
reg[2] = data[2 + tid]; | |
reg[3] = data[3 + tid]; | |
reg[4] = data[4 + tid]; | |
reg[5] = data[5 + tid]; | |
reg[6] = data[6 + tid]; | |
reg[7] = data[7 + tid]; | |
reg[8] = data[8 + tid]; | |
reg[9] = data[9 + tid]; | |
reg[10] = data[10 + tid]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void regstr_to_shared_1_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[0 + tid] = reg[0]; | |
dest[1 + tid] = reg[1]; | |
dest[2 + tid] = reg[2]; | |
dest[3 + tid] = reg[3]; | |
dest[4 + tid] = reg[4]; | |
dest[5 + tid] = reg[5]; | |
dest[6 + tid] = reg[6]; | |
dest[7 + tid] = reg[7]; | |
dest[8 + tid] = reg[8]; | |
dest[9 + tid] = reg[9]; | |
dest[10 + tid] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void global_to_shared_1_11_int | |
( | |
int count, | |
global const int * source, | |
int tid, | |
local int * dest | |
) | |
{ | |
int reg[11]; | |
global_to_regstr_1_11_int(count, source, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, dest); | |
} | |
void shared_to_global_1_11_int | |
( | |
int count, | |
local const int * source, | |
int tid, | |
global int * dest | |
) | |
{ | |
int index; | |
index = 0 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 1 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 2 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 3 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 4 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 5 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 6 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 7 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 8 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 9 + tid; | |
if (index < count) dest[index] = source[index]; | |
index = 10 + tid; | |
if (index < count) dest[index] = source[index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void shared_to_thread_11_int | |
( | |
local const int * data, | |
int tid, | |
int * reg | |
) | |
{ | |
reg[0] = data[11 * tid + 0]; | |
reg[1] = data[11 * tid + 1]; | |
reg[2] = data[11 * tid + 2]; | |
reg[3] = data[11 * tid + 3]; | |
reg[4] = data[11 * tid + 4]; | |
reg[5] = data[11 * tid + 5]; | |
reg[6] = data[11 * tid + 6]; | |
reg[7] = data[11 * tid + 7]; | |
reg[8] = data[11 * tid + 8]; | |
reg[9] = data[11 * tid + 9]; | |
reg[10] = data[11 * tid + 10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void thread_to_shared_11_int | |
( | |
const int * reg, | |
int tid, | |
local int * dest | |
) | |
{ | |
dest[11 * tid + 0] = reg[0]; | |
dest[11 * tid + 1] = reg[1]; | |
dest[11 * tid + 2] = reg[2]; | |
dest[11 * tid + 3] = reg[3]; | |
dest[11 * tid + 4] = reg[4]; | |
dest[11 * tid + 5] = reg[5]; | |
dest[11 * tid + 6] = reg[6]; | |
dest[11 * tid + 7] = reg[7]; | |
dest[11 * tid + 8] = reg[8]; | |
dest[11 * tid + 9] = reg[9]; | |
dest[11 * tid + 10] = reg[10]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
void serial_merge_11_int_float | |
( | |
int a_begin, | |
int a_end, | |
int b_begin, | |
int b_end, | |
int * indices, | |
local const int * keys_shared0, | |
local const float * keys_shared1, | |
int * results0, | |
float * results1 | |
) | |
{ | |
int a_key0 = keys_shared0[a_begin]; | |
int b_key0 = keys_shared0[b_begin]; | |
float a_key1 = keys_shared1[a_begin]; | |
float b_key1 = keys_shared1[b_begin]; | |
bool p; | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[0] = p ? a_key0 : b_key0; | |
results1[0] = p ? a_key1 : b_key1; | |
indices[0] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[1] = p ? a_key0 : b_key0; | |
results1[1] = p ? a_key1 : b_key1; | |
indices[1] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[2] = p ? a_key0 : b_key0; | |
results1[2] = p ? a_key1 : b_key1; | |
indices[2] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[3] = p ? a_key0 : b_key0; | |
results1[3] = p ? a_key1 : b_key1; | |
indices[3] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[4] = p ? a_key0 : b_key0; | |
results1[4] = p ? a_key1 : b_key1; | |
indices[4] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[5] = p ? a_key0 : b_key0; | |
results1[5] = p ? a_key1 : b_key1; | |
indices[5] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[6] = p ? a_key0 : b_key0; | |
results1[6] = p ? a_key1 : b_key1; | |
indices[6] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[7] = p ? a_key0 : b_key0; | |
results1[7] = p ? a_key1 : b_key1; | |
indices[7] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[8] = p ? a_key0 : b_key0; | |
results1[8] = p ? a_key1 : b_key1; | |
indices[8] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[9] = p ? a_key0 : b_key0; | |
results1[9] = p ? a_key1 : b_key1; | |
indices[9] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); | |
results0[10] = p ? a_key0 : b_key0; | |
results1[10] = p ? a_key1 : b_key1; | |
indices[10] = p ? a_begin : b_begin; | |
if(p) | |
{ | |
++a_begin; | |
a_key0 = keys_shared0[a_begin]; | |
a_key1 = keys_shared1[a_begin]; | |
} | |
else | |
{ | |
++b_begin; | |
b_key0 = keys_shared0[b_begin]; | |
b_key1 = keys_shared1[b_begin]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
int merge_path_int_float | |
( | |
int a_count, | |
int b_count, | |
int diag, | |
local const int * a0, | |
local const float * a1, | |
local const int * b0, | |
local const float * b1 | |
) | |
{ | |
int begin = max(0, diag - b_count); | |
int end = min(diag, a_count); | |
while (begin < end) | |
{ | |
int mid = (begin + end) >> 1; | |
if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1; | |
else end = mid; | |
} | |
return begin; | |
} | |
void load2_to_regstr_1_11_11_float | |
( | |
global const float * a_global, | |
int a_count, | |
global const float * b_global, | |
int b_count, | |
int tid, | |
float * reg | |
) | |
{ | |
b_global -= a_count; | |
int total = a_count + b_count; | |
int index; | |
if (total >= 11) | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else reg[10] = b_global[index]; | |
} | |
else | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else if (index < total) reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else if (index < total) reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else if (index < total) reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else if (index < total) reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else if (index < total) reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else if (index < total) reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else if (index < total) reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else if (index < total) reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else if (index < total) reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else if (index < total) reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else if (index < total) reg[10] = b_global[index]; | |
} | |
} | |
void load2_to_shared_1_11_11_float | |
( | |
global const float * a_global, | |
int a_count, | |
global const float * b_global, | |
int b_count, | |
int tid, | |
local float * shared | |
) | |
{ | |
float reg[11]; | |
load2_to_regstr_1_11_11_float(a_global, a_count, b_global, b_count, tid, reg); | |
regstr_to_shared_1_11_float(reg, tid, shared); | |
} | |
void load2_to_regstr_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
int * reg | |
) | |
{ | |
b_global -= a_count; | |
int total = a_count + b_count; | |
int index; | |
if (total >= 11) | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else reg[10] = b_global[index]; | |
} | |
else | |
{ | |
index = 0 + tid; | |
if (index < a_count) reg[0] = a_global[index]; | |
else if (index < total) reg[0] = b_global[index]; | |
index = 1 + tid; | |
if (index < a_count) reg[1] = a_global[index]; | |
else if (index < total) reg[1] = b_global[index]; | |
index = 2 + tid; | |
if (index < a_count) reg[2] = a_global[index]; | |
else if (index < total) reg[2] = b_global[index]; | |
index = 3 + tid; | |
if (index < a_count) reg[3] = a_global[index]; | |
else if (index < total) reg[3] = b_global[index]; | |
index = 4 + tid; | |
if (index < a_count) reg[4] = a_global[index]; | |
else if (index < total) reg[4] = b_global[index]; | |
index = 5 + tid; | |
if (index < a_count) reg[5] = a_global[index]; | |
else if (index < total) reg[5] = b_global[index]; | |
index = 6 + tid; | |
if (index < a_count) reg[6] = a_global[index]; | |
else if (index < total) reg[6] = b_global[index]; | |
index = 7 + tid; | |
if (index < a_count) reg[7] = a_global[index]; | |
else if (index < total) reg[7] = b_global[index]; | |
index = 8 + tid; | |
if (index < a_count) reg[8] = a_global[index]; | |
else if (index < total) reg[8] = b_global[index]; | |
index = 9 + tid; | |
if (index < a_count) reg[9] = a_global[index]; | |
else if (index < total) reg[9] = b_global[index]; | |
index = 10 + tid; | |
if (index < a_count) reg[10] = a_global[index]; | |
else if (index < total) reg[10] = b_global[index]; | |
} | |
} | |
void load2_to_shared_1_11_11_int | |
( | |
global const int * a_global, | |
int a_count, | |
global const int * b_global, | |
int b_count, | |
int tid, | |
local int * shared | |
) | |
{ | |
int reg[11]; | |
load2_to_regstr_1_11_11_int(a_global, a_count, b_global, b_count, tid, reg); | |
regstr_to_shared_1_11_int(reg, tid, shared); | |
} | |
// Merge one tile of the two keyed sequences (int primary key, float
// secondary key) in local memory, producing per-thread merged keys and
// the source indices needed later for moving the value streams.
void merge_keys_indices_1_11_int_float
(
    int a_count,
    int b_count,
    int4 range,
    int tid,
    int * indices,
    global const int * a_global0,
    global const float * a_global1,
    global const int * b_global0,
    global const float * b_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    int * results0,
    float * results1
)
{
    // Narrow the global ranges to this work-group's tile.
    int a_begin = range.x;
    int a_end   = range.y;
    int b_begin = range.z;
    int b_end   = range.w;
    a_count = a_end - a_begin;
    b_count = b_end - b_begin;
    // Stage both key streams into local memory: A occupies [0, a_count),
    // B follows it.
    load2_to_shared_1_11_11_int(a_global0 + a_begin, a_count, b_global0 + b_begin, b_count, tid, keys_shared0);
    load2_to_shared_1_11_11_float(a_global1 + a_begin, a_count, b_global1 + b_begin, b_count, tid, keys_shared1);
    // Each thread serially merges 11 elements starting at its merge-path
    // split point along cross-diagonal 11 * tid.
    int diag = 11 * tid;
    int split = merge_path_int_float(a_count, b_count, diag, keys_shared0, keys_shared1, keys_shared0 + a_count, keys_shared1 + a_count);
    int a_cur = split;
    int a_lim = a_count;
    int b_cur = a_count + diag - split;
    int b_lim = a_count + b_count;
    serial_merge_11_int_float(a_cur, a_lim, b_cur, b_lim, indices, keys_shared0, keys_shared1, results0, results1);
}
// Gather merged values into registers using the indices recorded during
// the key merge. Indices < b_start address stream A; the rest address
// stream B (the B pointers are rebased so the merged index applies
// directly). The original emitted this fully unrolled 11 times; the
// loops below perform the identical accesses in the identical order.
void transfer_merge_values_regstr_1_11_long_short
(
    int count,
    int b_start,
    const int * indices,
    int tid,
    global const long * a_global0,
    global const short * a_global1,
    global const long * b_global0,
    global const short * b_global1,
    long * reg0,
    short * reg1
)
{
    // Rebase B so that an index >= b_start dereferences B correctly.
    b_global0 -= b_start;
    b_global1 -= b_start;
    if (count >= 11)
    {
        // Full tile: all 11 slots hold valid indices.
        for (int i = 0; i < 11; ++i)
        {
            int ix = indices[i];
            if (ix < b_start)
            {
                reg0[i] = a_global0[ix];
                reg1[i] = a_global1[ix];
            }
            else
            {
                reg0[i] = b_global0[ix];
                reg1[i] = b_global1[ix];
            }
        }
    }
    else
    {
        // Partial tile: guard each slot against the logical element count.
        for (int i = 0; i < 11; ++i)
        {
            if (i + tid < count)
            {
                int ix = indices[i];
                if (ix < b_start)
                {
                    reg0[i] = a_global0[ix];
                    reg1[i] = a_global1[ix];
                }
                else
                {
                    reg0[i] = b_global0[ix];
                    reg1[i] = b_global1[ix];
                }
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
// Move the two value streams to their merged destinations: read this
// thread's merge indices from local memory, gather the values into
// registers, then write them out to global memory.
void transfer_merge_values_shared_1_11_long_short
(
    int count,
    int b_start,
    local const int * indices_shared,
    int tid,
    global const long * a_global0,
    global const short * a_global1,
    global const long * b_global0,
    global const short * b_global1,
    global long * dest_global0,
    global short * dest_global1
)
{
    int merge_idx[11];
    long vals0[11];
    short vals1[11];
    shared_to_regstr_1_11_int(indices_shared, tid, merge_idx);
    transfer_merge_values_regstr_1_11_long_short(count, b_start, merge_idx, tid, a_global0, a_global1, b_global0, b_global1, vals0, vals1);
    regstr_to_global_1_11_long(count, vals0, tid, dest_global0);
    regstr_to_global_1_11_short(count, vals1, tid, dest_global1);
}
// Per-work-group merge driver: merge the keys through local memory,
// write them out, then gather the associated values via the recorded
// merge indices.
void device_merge_1_11_int_float_long_short
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    local int * keys_shared0,
    local float * keys_shared1,
    global const long * a_vals_global0,
    global const short * a_vals_global1,
    global const long * b_vals_global0,
    global const short * b_vals_global1,
    global long * vals_global0,
    global short * vals_global1,
    int tid,
    int block,
    int4 range,
    local int * indices_shared
)
{
    int merged_keys0[11];
    float merged_keys1[11];
    int merged_idx[11];
    merge_keys_indices_1_11_int_float(a_count, b_count, range, tid, merged_idx, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_shared0, keys_shared1, merged_keys0, merged_keys1);
    // Round-trip the merged keys through local memory so the global
    // store can be done tile-wise.
    thread_to_shared_11_int(merged_keys0, tid, keys_shared0);
    thread_to_shared_11_float(merged_keys1, tid, keys_shared1);
    a_count = range.y - range.x;
    b_count = range.w - range.z;
    shared_to_global_1_11_int(a_count + b_count, keys_shared0, tid, keys_global0 + 11 * block);
    shared_to_global_1_11_float(a_count + b_count, keys_shared1, tid, keys_global1 + 11 * block);
    // Publish the merge indices, then move both value streams accordingly.
    thread_to_shared_11_int(merged_idx, tid, indices_shared);
    transfer_merge_values_shared_1_11_long_short(a_count + b_count, a_count, indices_shared, tid, a_vals_global0 + range.x, a_vals_global1 + range.x, b_vals_global0 + range.z, b_vals_global1 + range.z, vals_global0 + 11 * block, vals_global1 + 11 * block);
}
// Top-level merge kernel: each work-group merges one 11-element tile of
// two keyed sequences (keys split across int/float streams, values
// across long/short streams), using the precomputed merge-path splits
// in mp_global.
kernel void merge
(
    int a_count,
    int b_count,
    global const int * a_keys_global0,
    global const float * a_keys_global1,
    global const int * b_keys_global0,
    global const float * b_keys_global1,
    global int * keys_global0,
    global float * keys_global1,
    global const long * a_vals_global0,
    global const short * a_vals_global1,
    global const long * b_vals_global0,
    global const short * b_vals_global1,
    global long * vals_global0,
    global short * vals_global1,
    global const int * mp_global,
    int coop
)
{
    // Key staging and index staging reuse the same local storage.
    union Shared
    {
        struct
        {
            int keys0[12];
            float keys1[12];
        };
        int indices[11];
    };
    local union Shared lmem;
    int tid = get_local_id(0);
    int block = get_group_id(0);
    int4 range = compute_merge_range(a_count, b_count, block, coop, 11, mp_global);
    device_merge_1_11_int_float_long_short(a_count, b_count, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_global0, keys_global1, lmem.keys0, lmem.keys1, a_vals_global0, a_vals_global1, b_vals_global0, b_vals_global1, vals_global0, vals_global1, tid, block, range, lmem.indices);
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Lexicographic "less-than" on (key, value) pairs: the int key decides
// first; the float value breaks ties.
bool comp(int a1, float a2, int b1, float b2)
{
    if (a1 != b1)
        return a1 < b1;
    return a2 < b2;
}
// Merge-path search: binary search along cross-diagonal `diag` of the
// merge grid, returning how many elements of A precede the split.
int merge_path_int_float
(
    int a_count,
    int b_count,
    int diag,
    global const int * a0,
    global const float * a1,
    global const int * b0,
    global const float * b1
)
{
    int lo = max(0, diag - b_count);
    int hi = min(diag, a_count);
    while (lo < hi)
    {
        int mid = (lo + hi) >> 1;
        int bi = diag - 1 - mid;
        if (comp(b0[bi], b1[bi], a0[mid], a1[mid]))
            hi = mid;
        else
            lo = mid + 1;
    }
    return lo;
}
// Compute the mergesort frame for `block` at cooperation level `coop`:
// .x = first element of the left run, .y = first element of the right
// run, .z = run size in elements. Callers visible here only read
// x/y/z, but the original left .w uninitialized, so the function
// returned an indeterminate vector component; it is now zeroed.
int4 find_mergesort_frame
(
    int coop,
    int block,
    int nv
)
{
    // Round `block` down to the first block of its cooperative group.
    int start = ~(coop - 1) & block;
    int size = nv * (coop >> 1);
    int4 frame;
    frame.x = nv * start;
    frame.y = nv * start + size;
    frame.z = size;
    frame.w = 0; // was never written — avoid returning indeterminate data
    return frame;
}
// Partition kernel: one work-item per tile boundary. Computes the
// merge-path split (elements of A consumed before the boundary) and
// records it in mp_global for the subsequent merge kernel.
kernel void merge_partition | |
( | |
int a_count, | |
int b_count, | |
int nv, | |
int coop, | |
global int * mp_global, | |
int num_searches, | |
global const int * a_global0, | |
global const float * a_global1, | |
global const int * b_global0, | |
global const float * b_global1 | |
) | |
{ | |
int partition = get_global_id(0); | |
if (partition < num_searches) | |
{ | |
int a0 = 0, b0 = 0; | |
int gid = nv * partition; | |
// coop != 0 means this is a mergesort pass: restrict the search to
// this block's frame inside the single (key-sorted) input buffer.
if(coop) | |
{ | |
int4 frame = find_mergesort_frame(coop, partition, nv); | |
a0 = frame.x; | |
b0 = min(a_count, frame.y); | |
b_count = min(a_count, frame.y + frame.z) - b0; | |
a_count = min(a_count, frame.x + frame.z) - a0; | |
gid -= a0; | |
} | |
int mp = merge_path_int_float(a_count, b_count, min(gid, a_count + b_count), a_global0 + a0, a_global1 + a0, b_global0 + b0, b_global1 + b0); | |
mp_global[partition] = mp; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 16.72 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"sort" end time: Jan 30 11:28 IST | |
"sort" time elapsed: 00:00:16 | |
---------------------------------------------------------- | |
22/30 Testing: scan | |
22/30 Test: scan | |
Command: "/tmp/vexcl/build/tests/scan" | |
Directory: /tmp/vexcl/build/tests | |
"scan" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597490 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Element-wise vector addition: prm_1 = prm_2 + prm_3, with each
// work-item handling one contiguous chunk of [0, n).
kernel void vexcl_vector_kernel
(
    ulong n,
    global float * prm_1,
    global float * prm_2,
    global float * prm_3
)
{
    ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    ulong begin = get_global_id(0) * per_item;
    ulong end = begin + per_item;
    if (end > n) end = n;
    for (ulong i = begin; i < end; ++i)
        prm_1[i] = ( prm_2[i] + prm_3[i] );
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// First scan pass: each work-group loads two adjacent elements into
// local memory, runs an in-place up-sweep, and emits the group total
// (scan_buf1) and the first partial (scan_buf2).
// NOTE(review): shared[2] implies two elements per work-group — looks
// generated for work-group size 1 on a CPU device; confirm launch config.
kernel void block_inclusive_scan | |
( | |
ulong n, | |
global const int * input, | |
int identity, | |
global int * scan_buf1, | |
global int * scan_buf2, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
local int shared[2]; | |
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id]; | |
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1]; | |
// Exclusive scans seed the very first element with the identity.
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]); | |
for (size_t start = 1; start > 0; start >>= 1, offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int y2 = shared[temp2]; | |
int y1 = shared[temp1]; | |
shared[temp2] = oper(y2, y1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
scan_buf1[ block ] = shared[1]; | |
scan_buf2[ block ] = shared[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Second scan pass: each work-item serially accumulates work_per_thread
// block sums from pre_sum and writes the inclusive results to post_sum.
// NOTE(review): when map_id >= n, work_sum is read below uninitialized,
// and the trailing loop indexes pre_sum/post_sum past n — presumably the
// host sizes buffers/launch so this cannot happen; verify against caller.
kernel void intra_block_inclusive_scan | |
( | |
ulong n, | |
global int * post_sum, | |
global const int * pre_sum, | |
int identity, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
local int shared[1]; | |
size_t offset; | |
int work_sum; | |
if (map_id < n) | |
{ | |
offset = 0; | |
work_sum = pre_sum[map_id]; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] ); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared[ l_id ] = work_sum; | |
// Degenerate local scan: the loop bound (offset < 1) never runs —
// generated for a work-group size of 1.
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
work_sum = pre_sum[map_id]; | |
if (l_id > 0) | |
{ | |
work_sum = oper(work_sum, shared[l_id - 1]); | |
post_sum[map_id] = work_sum; | |
} | |
else post_sum[map_id] = work_sum; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = oper(pre_sum[map_id + offset], work_sum); | |
post_sum[ map_id + offset ] = y; | |
work_sum = y; | |
} | |
else | |
{ | |
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum); | |
work_sum = post_sum[map_id + offset]; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Final scan pass: combine each element with the scanned block sums
// (post_sum/pre_sum from the earlier passes) and write the result.
// NOTE(review): val is read into shared[] uninitialized when g_id >= n —
// presumably harmless because the final store is guarded; verify.
kernel void block_addition | |
( | |
ulong n, | |
global const int * input, | |
global int * output, | |
global int * post_sum, | |
global int * pre_sum, | |
int identity, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
int val; | |
local int shared[1]; | |
if (g_id < n) | |
{ | |
// Exclusive scan shifts the input right by one, seeding identity.
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity; | |
else val = input[g_id]; | |
} | |
shared[l_id] = val; | |
int scan_result = val; | |
int post_block_sum, new_result; | |
int y1, y2, sum; | |
if(l_id == 0 && g_id < n) | |
{ | |
if(block > 0) | |
{ | |
// Pick the carry-in for this block from the two partial-sum buffers.
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ]; | |
else if(block == 1) post_block_sum = pre_sum[0]; | |
else | |
{ | |
y1 = post_sum[ block/2 - 1 ]; | |
y2 = pre_sum [ block/2]; | |
post_block_sum = oper(y1, y2); | |
} | |
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum ); | |
} | |
else new_result = scan_result; | |
shared[ l_id ] = new_result; | |
} | |
sum = shared[ l_id ]; | |
// Degenerate local scan (bound offset < 1 never runs): work-group size 1.
for( size_t offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(g_id < n) output[ g_id ] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// User-supplied element-wise functor: integer addition.
int device(int x, int y)
{
    return y + x;
}
// Element-wise application of the user functor `device` with a scalar
// second argument; one contiguous chunk of [0, n) per work-item.
kernel void vexcl_vector_kernel
(
    ulong n,
    global int * prm_1,
    global int * prm_2,
    int prm_3
)
{
    ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    ulong begin = get_global_id(0) * per_item;
    ulong end = begin + per_item;
    if (end > n) end = n;
    for (ulong i = begin; i < end; ++i)
        prm_1[i] = device( prm_2[i], prm_3 );
}
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: double-precision addition.
double oper(double x, double y)
{
    return y + x;
}
// First scan pass (double variant): two elements per work-group are
// loaded into local memory, up-swept, and the group total (scan_buf1)
// and first partial (scan_buf2) emitted.
kernel void block_inclusive_scan | |
( | |
ulong n, | |
global const double * input, | |
double identity, | |
global double * scan_buf1, | |
global double * scan_buf2, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
local double shared[2]; | |
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id]; | |
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1]; | |
// Exclusive scans seed the very first element with the identity.
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]); | |
for (size_t start = 1; start > 0; start >>= 1, offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
double y2 = shared[temp2]; | |
double y1 = shared[temp1]; | |
shared[temp2] = oper(y2, y1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
scan_buf1[ block ] = shared[1]; | |
scan_buf2[ block ] = shared[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: double-precision addition.
double oper(double x, double y)
{
    return y + x;
}
// Second scan pass (double variant): serially accumulate
// work_per_thread block sums from pre_sum into post_sum.
// NOTE(review): when map_id >= n, work_sum is read uninitialized and
// the trailing loop can index past n — presumably prevented by the
// host-side launch configuration; verify.
kernel void intra_block_inclusive_scan | |
( | |
ulong n, | |
global double * post_sum, | |
global const double * pre_sum, | |
double identity, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
local double shared[1]; | |
size_t offset; | |
double work_sum; | |
if (map_id < n) | |
{ | |
offset = 0; | |
work_sum = pre_sum[map_id]; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] ); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
double scan_sum = work_sum; | |
shared[ l_id ] = work_sum; | |
// Degenerate local scan (bound offset < 1): work-group size 1.
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
work_sum = pre_sum[map_id]; | |
if (l_id > 0) | |
{ | |
work_sum = oper(work_sum, shared[l_id - 1]); | |
post_sum[map_id] = work_sum; | |
} | |
else post_sum[map_id] = work_sum; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
double y = oper(pre_sum[map_id + offset], work_sum); | |
post_sum[ map_id + offset ] = y; | |
work_sum = y; | |
} | |
else | |
{ | |
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum); | |
work_sum = post_sum[map_id + offset]; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Binary associative scan operator: double-precision addition.
double oper(double x, double y)
{
    return y + x;
}
// Final scan pass (double variant): combine each element with the
// scanned block sums and store the result.
// NOTE(review): val is copied into shared[] uninitialized when
// g_id >= n; the final store is guarded — verify this is benign.
kernel void block_addition | |
( | |
ulong n, | |
global const double * input, | |
global double * output, | |
global double * post_sum, | |
global double * pre_sum, | |
double identity, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
double val; | |
local double shared[1]; | |
if (g_id < n) | |
{ | |
// Exclusive scan shifts the input right by one, seeding identity.
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity; | |
else val = input[g_id]; | |
} | |
shared[l_id] = val; | |
double scan_result = val; | |
double post_block_sum, new_result; | |
double y1, y2, sum; | |
if(l_id == 0 && g_id < n) | |
{ | |
if(block > 0) | |
{ | |
// Pick the carry-in for this block from the two partial-sum buffers.
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ]; | |
else if(block == 1) post_block_sum = pre_sum[0]; | |
else | |
{ | |
y1 = post_sum[ block/2 - 1 ]; | |
y2 = pre_sum [ block/2]; | |
post_block_sum = oper(y1, y2); | |
} | |
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum ); | |
} | |
else new_result = scan_result; | |
shared[ l_id ] = new_result; | |
} | |
sum = shared[ l_id ]; | |
// Degenerate local scan (bound offset < 1): work-group size 1.
for( size_t offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(g_id < n) output[ g_id ] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// User-supplied element-wise functor: double-precision addition.
double device(double x, double y)
{
    return y + x;
}
// Element-wise application of the user functor `device` with a scalar
// second argument; one contiguous chunk of [0, n) per work-item.
kernel void vexcl_vector_kernel
(
    ulong n,
    global double * prm_1,
    global double * prm_2,
    double prm_3
)
{
    ulong per_item = (n + get_global_size(0) - 1) / get_global_size(0);
    ulong begin = get_global_id(0) * per_item;
    ulong end = begin + per_item;
    if (end > n) end = n;
    for (ulong i = begin; i < end; ++i)
        prm_1[i] = device( prm_2[i], prm_3 );
}
*** No errors detected | |
<end of output> | |
Test time = 0.56 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"scan" end time: Jan 30 11:28 IST | |
"scan" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
23/30 Testing: scan_by_key | |
23/30 Test: scan_by_key | |
Command: "/tmp/vexcl/build/tests/scan_by_key" | |
Directory: /tmp/vexcl/build/tests | |
"scan_by_key" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597491 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Key equality predicate for scan-by-key segmentation.
bool comp(int x, int y)
{
    return !(x != y);
}
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Segmented scan, pass 1: each work-group loads two (value, key) pairs
// into local memory and combines adjacent values only when their keys
// compare equal, emitting the group's top value/key and first partial.
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * ivals, | |
global int * ovals1, | |
global int * ovals2, | |
global const int * ikeys0, | |
global int * okeys0 | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
size_t pos = block * 2 + l_id; | |
struct Shared | |
{ | |
int vals[2]; | |
int keys0[2]; | |
}; | |
local struct Shared shared; | |
if (pos < n) | |
{ | |
shared.vals[l_id] = ivals[pos]; | |
shared.keys0[l_id] = ikeys0[pos]; | |
} | |
if (pos + 1 < n) | |
{ | |
shared.vals[l_id + 1] = ivals[pos + 1]; | |
shared.keys0[l_id + 1] = ikeys0[pos + 1]; | |
} | |
// start = 1 then 0: a single up-sweep step (two elements per group).
for(size_t start = 1; start > 0; start /= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int key10 = shared.keys0[temp1]; | |
int key20 = shared.keys0[temp2]; | |
// Only accumulate across equal keys (same segment).
if (comp(key20, key10)) | |
{ | |
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]); | |
} | |
} | |
offset *= 2; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
okeys0[block] = shared.keys0[1]; | |
ovals1[block] = shared.vals[1]; | |
ovals2[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Key equality predicate for scan-by-key segmentation.
bool comp(int x, int y)
{
    return !(x != y);
}
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Segmented scan, pass 2: each work-item serially folds
// work_per_thread block sums (restarting at key boundaries) in place
// in pre_sum.
// NOTE(review): key0/work_sum are read after the barrier even when
// map_id >= n (uninitialized) — presumably prevented by the host launch
// configuration; verify.
kernel void block_inclusive_scan_by_key | |
( | |
ulong n, | |
global int * pre_sum, | |
uint work_per_thread, | |
global const int * key_sum0 | |
) | |
{ | |
size_t block = get_group_id(0); | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t map_id = g_id * work_per_thread; | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
}; | |
local struct Shared shared; | |
uint offset; | |
int key0; | |
int work_sum; | |
if (map_id < n) | |
{ | |
int prev_key0; | |
offset = 0; | |
key0 = key_sum0[map_id]; | |
work_sum = pre_sum[map_id]; | |
for(offset = 1; offset < work_per_thread; ++offset) | |
{ | |
prev_key0 = key0; | |
key0 = key_sum0[map_id + offset]; | |
if (map_id + offset < n) | |
{ | |
// Same segment: accumulate; new segment: restart the running sum.
if (comp(key0, prev_key0)) work_sum = oper(work_sum, pre_sum[map_id + offset]); | |
else work_sum = pre_sum[map_id + offset]; | |
pre_sum[map_id + offset] = work_sum; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared.vals[l_id] = work_sum; | |
shared.keys0[l_id] = key0; | |
// Degenerate local scan (bound offset < 1): work-group size 1.
for(offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n) | |
{ | |
if (l_id >= offset) | |
{ | |
int key10 = shared.keys0[l_id]; | |
int key20 = shared.keys0[l_id - offset]; | |
if (comp(key10, key20)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]); | |
else scan_sum = shared.vals[l_id]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// Fold the previous thread's carry into this thread's range.
for(offset = 0; offset < work_per_thread; ++offset) | |
{ | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = pre_sum[map_id + offset]; | |
int key10 = key_sum0[map_id + offset]; | |
int key20 = shared.keys0[l_id - 1]; | |
if (comp(key10, key20)) y = oper(y, shared.vals[l_id - 1]); | |
pre_sum[map_id + offset] = y; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Key equality predicate for scan-by-key segmentation.
bool comp(int x, int y)
{
    return !(x != y);
}
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Segmented scan, final pass: add the scanned block carry to each
// element, but only when the element's key matches the key at the end
// of the previous block (same segment).
kernel void block_add_by_key | |
( | |
ulong n, | |
global const int * pre_sum, | |
global const int * pre_sum1, | |
global const int * ivals, | |
global int * ovals, | |
global const int * ikeys0 | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
}; | |
local struct Shared shared; | |
int val; | |
int key0; | |
if (g_id < n) | |
{ | |
shared.vals[l_id] =val = ivals[g_id]; | |
shared.keys0[l_id] = key0 = ikeys0[g_id]; | |
} | |
int scan_result = shared.vals[l_id]; | |
int post_sum, new_result, sum; | |
int key10, key20, key30, key40; | |
if (l_id == 0 && g_id < n) | |
{ | |
if (block > 0) | |
{ | |
key10 = ikeys0[g_id]; | |
key20 = ikeys0[block * 1 - 1]; | |
// Select this block's carry-in from the two partial-sum buffers.
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1]; | |
else if (block == 1) post_sum = pre_sum1[0]; | |
else | |
{ | |
key30 = ikeys0[block * 1 - 1]; | |
key40 = ikeys0[(block - 1) * 1 - 1]; | |
if (comp(key30, key40)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]); | |
else post_sum = pre_sum1[block / 2]; | |
} | |
// Apply the carry only inside the same key segment.
if (comp(key10, key20)) new_result = oper(scan_result, post_sum); | |
else new_result = scan_result; | |
} | |
else new_result = scan_result; | |
shared.vals[l_id] = new_result; | |
} | |
sum = shared.vals[l_id]; | |
// Degenerate local scan (bound offset < 1): work-group size 1.
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) | |
{ | |
key20 = shared.keys0[l_id - offset]; | |
if (comp(key0, key20)) sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) ovals[g_id] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
// Key equality predicate for scan-by-key segmentation.
bool comp(int x, int y)
{
    return !(x != y);
}
// Binary associative scan operator: integer addition.
int oper(int x, int y)
{
    return y + x;
}
// Segmented scan, pass 1 (exclusive variant with an initial value):
// each loaded value that starts a new key segment is seeded with
// oper(init, value); adjacent values are then combined only across
// equal keys, and the group's top value/key and first partial emitted.
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * ivals, | |
global int * ovals1, | |
global int * ovals2, | |
global const int * ikeys0, | |
global int * okeys0, | |
int init | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
size_t pos = block * 2 + l_id; | |
struct Shared | |
{ | |
int vals[2]; | |
int keys0[2]; | |
}; | |
local struct Shared shared; | |
if (g_id > 0 && pos < n) | |
{ | |
int key10 = ikeys0[pos]; | |
int key20 = ikeys0[pos - 1]; | |
// Segment continues: load as-is; segment starts: seed with init.
if (comp(key10, key20)) | |
{ | |
shared.vals[l_id] = ivals[pos]; | |
} | |
else | |
{ | |
shared.vals[l_id] = oper(init, ivals[pos]); | |
} | |
shared.keys0[l_id] = ikeys0[pos]; | |
} | |
else | |
{ | |
// Very first element always begins a segment.
shared.vals[l_id] = oper(init, ivals[0]); | |
shared.keys0[l_id] = ikeys0[0]; | |
} | |
if (pos + 1 < n) | |
{ | |
int key10 = ikeys0[pos + 1]; | |
int key20 = ikeys0[pos + 1 - 1]; | |
if (comp(key10, key20)) | |
{ | |
shared.vals[l_id + 1] = ivals[pos + 1]; | |
} | |
else | |
{ | |
shared.vals[l_id + 1] = oper(init, ivals[pos + 1]); | |
} | |
shared.keys0[l_id + 1] = ikeys0[pos + 1]; | |
} | |
// start = 1 then 0: a single up-sweep step (two elements per group).
for(size_t start = 1; start > 0; start /= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int key10 = shared.keys0[temp1]; | |
int key20 = shared.keys0[temp2]; | |
if (comp(key20, key10)) | |
{ | |
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]); | |
} | |
} | |
offset *= 2; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
okeys0[block] = shared.keys0[1]; | |
ovals1[block] = shared.vals[1]; | |
ovals2[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int x, | |
int y | |
) | |
{ | |
return x == y; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan_by_key | |
( | |
ulong n, | |
global int * pre_sum, | |
uint work_per_thread, | |
global const int * key_sum0 | |
) | |
{ | |
size_t block = get_group_id(0); | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t map_id = g_id * work_per_thread; | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
}; | |
local struct Shared shared; | |
uint offset; | |
int key0; | |
int work_sum; | |
if (map_id < n) | |
{ | |
int prev_key0; | |
offset = 0; | |
key0 = key_sum0[map_id]; | |
work_sum = pre_sum[map_id]; | |
for(offset = 1; offset < work_per_thread; ++offset) | |
{ | |
prev_key0 = key0; | |
key0 = key_sum0[map_id + offset]; | |
if (map_id + offset < n) | |
{ | |
if (comp(key0, prev_key0)) work_sum = oper(work_sum, pre_sum[map_id + offset]); | |
else work_sum = pre_sum[map_id + offset]; | |
pre_sum[map_id + offset] = work_sum; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared.vals[l_id] = work_sum; | |
shared.keys0[l_id] = key0; | |
for(offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n) | |
{ | |
if (l_id >= offset) | |
{ | |
int key10 = shared.keys0[l_id]; | |
int key20 = shared.keys0[l_id - offset]; | |
if (comp(key10, key20)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]); | |
else scan_sum = shared.vals[l_id]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for(offset = 0; offset < work_per_thread; ++offset) | |
{ | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = pre_sum[map_id + offset]; | |
int key10 = key_sum0[map_id + offset]; | |
int key20 = shared.keys0[l_id - 1]; | |
if (comp(key10, key20)) y = oper(y, shared.vals[l_id - 1]); | |
pre_sum[map_id + offset] = y; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int x, | |
int y | |
) | |
{ | |
return x == y; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_add_by_key | |
( | |
ulong n, | |
global const int * pre_sum, | |
global const int * pre_sum1, | |
global const int * ivals, | |
global int * ovals, | |
global const int * ikeys0, | |
int init | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
}; | |
local struct Shared shared; | |
int val; | |
int key0; | |
if (g_id < n) | |
{ | |
if (g_id > 0) | |
{ | |
int key10 = key0 = ikeys0[g_id]; | |
int key20 = ikeys0[g_id-1]; | |
if (comp(key10, key20)) val = ivals[g_id - 1]; | |
else val = init; | |
shared.vals[l_id] = val; | |
shared.keys0[l_id] = key0; | |
} | |
else | |
{ | |
val = init; | |
shared.vals[l_id] = val; | |
shared.keys0[l_id] = ikeys0[g_id]; | |
} | |
} | |
int scan_result = shared.vals[l_id]; | |
int post_sum, new_result, sum; | |
int key10, key20, key30, key40; | |
if (l_id == 0 && g_id < n) | |
{ | |
if (block > 0) | |
{ | |
key10 = ikeys0[g_id]; | |
key20 = ikeys0[block * 1 - 1]; | |
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1]; | |
else if (block == 1) post_sum = pre_sum1[0]; | |
else | |
{ | |
key30 = ikeys0[block * 1 - 1]; | |
key40 = ikeys0[(block - 1) * 1 - 1]; | |
if (comp(key30, key40)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]); | |
else post_sum = pre_sum1[block / 2]; | |
} | |
if (comp(key10, key20)) new_result = post_sum; | |
else new_result = init; | |
} | |
else new_result = scan_result; | |
shared.vals[l_id] = new_result; | |
} | |
sum = shared.vals[l_id]; | |
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) | |
{ | |
key20 = shared.keys0[l_id - offset]; | |
if (comp(key0, key20)) sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) ovals[g_id] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
int a2, | |
int b1, | |
int b2 | |
) | |
{ | |
return a1 == b1 && a2 == b2; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * ivals, | |
global int * ovals1, | |
global int * ovals2, | |
global const int * ikeys0, | |
global const int * ikeys1, | |
global int * okeys0, | |
global int * okeys1 | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
size_t pos = block * 2 + l_id; | |
struct Shared | |
{ | |
int vals[2]; | |
int keys0[2]; | |
int keys1[2]; | |
}; | |
local struct Shared shared; | |
if (pos < n) | |
{ | |
shared.vals[l_id] = ivals[pos]; | |
shared.keys0[l_id] = ikeys0[pos]; | |
shared.keys1[l_id] = ikeys1[pos]; | |
} | |
if (pos + 1 < n) | |
{ | |
shared.vals[l_id + 1] = ivals[pos + 1]; | |
shared.keys0[l_id + 1] = ikeys0[pos + 1]; | |
shared.keys1[l_id + 1] = ikeys1[pos + 1]; | |
} | |
for(size_t start = 1; start > 0; start /= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int key10 = shared.keys0[temp1]; | |
int key20 = shared.keys0[temp2]; | |
int key11 = shared.keys1[temp1]; | |
int key21 = shared.keys1[temp2]; | |
if (comp(key20, key21, key10, key11)) | |
{ | |
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]); | |
} | |
} | |
offset *= 2; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
okeys0[block] = shared.keys0[1]; | |
okeys1[block] = shared.keys1[1]; | |
ovals1[block] = shared.vals[1]; | |
ovals2[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
int a2, | |
int b1, | |
int b2 | |
) | |
{ | |
return a1 == b1 && a2 == b2; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan_by_key | |
( | |
ulong n, | |
global int * pre_sum, | |
uint work_per_thread, | |
global const int * key_sum0, | |
global const int * key_sum1 | |
) | |
{ | |
size_t block = get_group_id(0); | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t map_id = g_id * work_per_thread; | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
int keys1[1]; | |
}; | |
local struct Shared shared; | |
uint offset; | |
int key0; | |
int key1; | |
int work_sum; | |
if (map_id < n) | |
{ | |
int prev_key0; | |
int prev_key1; | |
offset = 0; | |
key0 = key_sum0[map_id]; | |
key1 = key_sum1[map_id]; | |
work_sum = pre_sum[map_id]; | |
for(offset = 1; offset < work_per_thread; ++offset) | |
{ | |
prev_key0 = key0; | |
key0 = key_sum0[map_id + offset]; | |
prev_key1 = key1; | |
key1 = key_sum1[map_id + offset]; | |
if (map_id + offset < n) | |
{ | |
if (comp(key0, key1, prev_key0, prev_key1)) work_sum = oper(work_sum, pre_sum[map_id + offset]); | |
else work_sum = pre_sum[map_id + offset]; | |
pre_sum[map_id + offset] = work_sum; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared.vals[l_id] = work_sum; | |
shared.keys0[l_id] = key0; | |
shared.keys1[l_id] = key1; | |
for(offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n) | |
{ | |
if (l_id >= offset) | |
{ | |
int key10 = shared.keys0[l_id]; | |
int key20 = shared.keys0[l_id - offset]; | |
int key11 = shared.keys1[l_id]; | |
int key21 = shared.keys1[l_id - offset]; | |
if (comp(key10, key11, key20, key21)) scan_sum = oper(scan_sum, shared.vals[l_id - offset]); | |
else scan_sum = shared.vals[l_id]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for(offset = 0; offset < work_per_thread; ++offset) | |
{ | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = pre_sum[map_id + offset]; | |
int key10 = key_sum0[map_id + offset]; | |
int key20 = shared.keys0[l_id - 1]; | |
int key11 = key_sum1[map_id + offset]; | |
int key21 = shared.keys1[l_id - 1]; | |
if (comp(key10, key11, key20, key21)) y = oper(y, shared.vals[l_id - 1]); | |
pre_sum[map_id + offset] = y; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
int a2, | |
int b1, | |
int b2 | |
) | |
{ | |
return a1 == b1 && a2 == b2; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_add_by_key | |
( | |
ulong n, | |
global const int * pre_sum, | |
global const int * pre_sum1, | |
global const int * ivals, | |
global int * ovals, | |
global const int * ikeys0, | |
global const int * ikeys1 | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
int keys1[1]; | |
}; | |
local struct Shared shared; | |
int val; | |
int key0; | |
int key1; | |
if (g_id < n) | |
{ | |
shared.vals[l_id] =val = ivals[g_id]; | |
shared.keys0[l_id] = key0 = ikeys0[g_id]; | |
shared.keys1[l_id] = key1 = ikeys1[g_id]; | |
} | |
int scan_result = shared.vals[l_id]; | |
int post_sum, new_result, sum; | |
int key10, key20, key30, key40; | |
int key11, key21, key31, key41; | |
if (l_id == 0 && g_id < n) | |
{ | |
if (block > 0) | |
{ | |
key10 = ikeys0[g_id]; | |
key20 = ikeys0[block * 1 - 1]; | |
key11 = ikeys1[g_id]; | |
key21 = ikeys1[block * 1 - 1]; | |
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1]; | |
else if (block == 1) post_sum = pre_sum1[0]; | |
else | |
{ | |
key30 = ikeys0[block * 1 - 1]; | |
key40 = ikeys0[(block - 1) * 1 - 1]; | |
key31 = ikeys1[block * 1 - 1]; | |
key41 = ikeys1[(block - 1) * 1 - 1]; | |
if (comp(key30, key31, key40, key41)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]); | |
else post_sum = pre_sum1[block / 2]; | |
} | |
if (comp(key10, key11, key20, key21)) new_result = oper(scan_result, post_sum); | |
else new_result = scan_result; | |
} | |
else new_result = scan_result; | |
shared.vals[l_id] = new_result; | |
} | |
sum = shared.vals[l_id]; | |
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) | |
{ | |
key20 = shared.keys0[l_id - offset]; | |
key21 = shared.keys1[l_id - offset]; | |
if (comp(key0, key1, key20, key21)) sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) ovals[g_id] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
int a2, | |
int b1, | |
int b2 | |
) | |
{ | |
return a1 == b1 && a2 == b2; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * ivals, | |
global int * ovals1, | |
global int * ovals2, | |
global const int * ikeys0, | |
global const int * ikeys1, | |
global int * okeys0, | |
global int * okeys1, | |
int init | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
size_t pos = block * 2 + l_id; | |
struct Shared | |
{ | |
int vals[2]; | |
int keys0[2]; | |
int keys1[2]; | |
}; | |
local struct Shared shared; | |
if (g_id > 0 && pos < n) | |
{ | |
int key10 = ikeys0[pos]; | |
int key20 = ikeys0[pos - 1]; | |
int key11 = ikeys1[pos]; | |
int key21 = ikeys1[pos - 1]; | |
if (comp(key10, key11, key20, key21)) | |
{ | |
shared.vals[l_id] = ivals[pos]; | |
} | |
else | |
{ | |
shared.vals[l_id] = oper(init, ivals[pos]); | |
} | |
shared.keys0[l_id] = ikeys0[pos]; | |
shared.keys1[l_id] = ikeys1[pos]; | |
} | |
else | |
{ | |
shared.vals[l_id] = oper(init, ivals[0]); | |
shared.keys0[l_id] = ikeys0[0]; | |
shared.keys1[l_id] = ikeys1[0]; | |
} | |
if (pos + 1 < n) | |
{ | |
int key10 = ikeys0[pos + 1]; | |
int key20 = ikeys0[pos + 1 - 1]; | |
int key11 = ikeys1[pos + 1]; | |
int key21 = ikeys1[pos + 1 - 1]; | |
if (comp(key10, key11, key20, key21)) | |
{ | |
shared.vals[l_id + 1] = ivals[pos + 1]; | |
} | |
else | |
{ | |
shared.vals[l_id + 1] = oper(init, ivals[pos + 1]); | |
} | |
shared.keys0[l_id + 1] = ikeys0[pos + 1]; | |
shared.keys1[l_id + 1] = ikeys1[pos + 1]; | |
} | |
for(size_t start = 1; start > 0; start /= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int key10 = shared.keys0[temp1]; | |
int key20 = shared.keys0[temp2]; | |
int key11 = shared.keys1[temp1]; | |
int key21 = shared.keys1[temp2]; | |
if (comp(key20, key21, key10, key11)) | |
{ | |
shared.vals[temp2] = oper(shared.vals[temp2], shared.vals[temp1]); | |
} | |
} | |
offset *= 2; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
okeys0[block] = shared.keys0[1]; | |
okeys1[block] = shared.keys1[1]; | |
ovals1[block] = shared.vals[1]; | |
ovals2[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
int a2, | |
int b1, | |
int b2 | |
) | |
{ | |
return a1 == b1 && a2 == b2; | |
} | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_add_by_key | |
( | |
ulong n, | |
global const int * pre_sum, | |
global const int * pre_sum1, | |
global const int * ivals, | |
global int * ovals, | |
global const int * ikeys0, | |
global const int * ikeys1, | |
int init | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t l_id = get_local_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int vals[1]; | |
int keys0[1]; | |
int keys1[1]; | |
}; | |
local struct Shared shared; | |
int val; | |
int key0; | |
int key1; | |
if (g_id < n) | |
{ | |
if (g_id > 0) | |
{ | |
int key10 = key0 = ikeys0[g_id]; | |
int key20 = ikeys0[g_id-1]; | |
int key11 = key1 = ikeys1[g_id]; | |
int key21 = ikeys1[g_id-1]; | |
if (comp(key10, key11, key20, key21)) val = ivals[g_id - 1]; | |
else val = init; | |
shared.vals[l_id] = val; | |
shared.keys0[l_id] = key0; | |
shared.keys1[l_id] = key1; | |
} | |
else | |
{ | |
val = init; | |
shared.vals[l_id] = val; | |
shared.keys0[l_id] = ikeys0[g_id]; | |
shared.keys1[l_id] = ikeys1[g_id]; | |
} | |
} | |
int scan_result = shared.vals[l_id]; | |
int post_sum, new_result, sum; | |
int key10, key20, key30, key40; | |
int key11, key21, key31, key41; | |
if (l_id == 0 && g_id < n) | |
{ | |
if (block > 0) | |
{ | |
key10 = ikeys0[g_id]; | |
key20 = ikeys0[block * 1 - 1]; | |
key11 = ikeys1[g_id]; | |
key21 = ikeys1[block * 1 - 1]; | |
if (block % 2 == 0) post_sum = pre_sum[block / 2 - 1]; | |
else if (block == 1) post_sum = pre_sum1[0]; | |
else | |
{ | |
key30 = ikeys0[block * 1 - 1]; | |
key40 = ikeys0[(block - 1) * 1 - 1]; | |
key31 = ikeys1[block * 1 - 1]; | |
key41 = ikeys1[(block - 1) * 1 - 1]; | |
if (comp(key30, key31, key40, key41)) post_sum = oper(pre_sum[block / 2 - 1], pre_sum1[block / 2]); | |
else post_sum = pre_sum1[block / 2]; | |
} | |
if (comp(key10, key11, key20, key21)) new_result = post_sum; | |
else new_result = init; | |
} | |
else new_result = scan_result; | |
shared.vals[l_id] = new_result; | |
} | |
sum = shared.vals[l_id]; | |
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) | |
{ | |
key20 = shared.keys0[l_id - offset]; | |
key21 = shared.keys1[l_id - offset]; | |
if (comp(key0, key1, key20, key21)) sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id < n) ovals[g_id] = sum; | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.41 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"scan_by_key" end time: Jan 30 11:28 IST | |
"scan_by_key" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
24/30 Testing: reduce_by_key | |
24/30 Test: reduce_by_key | |
Command: "/tmp/vexcl/build/tests/reduce_by_key" | |
Directory: /tmp/vexcl/build/tests | |
"reduce_by_key" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597491 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int x, | |
int y | |
) | |
{ | |
return x == y; | |
} | |
kernel void offset_calculation | |
( | |
ulong n, | |
global const int * keys0, | |
global int * offsets | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
if (idx > 0) offsets[idx] = !comp(keys0[idx - 1], keys0[idx]); | |
else offsets[idx] = 0; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan | |
( | |
ulong n, | |
global const int * input, | |
int identity, | |
global int * scan_buf1, | |
global int * scan_buf2, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
local int shared[2]; | |
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id]; | |
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1]; | |
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]); | |
for (size_t start = 1; start > 0; start >>= 1, offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int y2 = shared[temp2]; | |
int y1 = shared[temp1]; | |
shared[temp2] = oper(y2, y1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
scan_buf1[ block ] = shared[1]; | |
scan_buf2[ block ] = shared[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void intra_block_inclusive_scan | |
( | |
ulong n, | |
global int * post_sum, | |
global const int * pre_sum, | |
int identity, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
local int shared[1]; | |
size_t offset; | |
int work_sum; | |
if (map_id < n) | |
{ | |
offset = 0; | |
work_sum = pre_sum[map_id]; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] ); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared[ l_id ] = work_sum; | |
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
work_sum = pre_sum[map_id]; | |
if (l_id > 0) | |
{ | |
work_sum = oper(work_sum, shared[l_id - 1]); | |
post_sum[map_id] = work_sum; | |
} | |
else post_sum[map_id] = work_sum; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = oper(pre_sum[map_id + offset], work_sum); | |
post_sum[ map_id + offset ] = y; | |
work_sum = y; | |
} | |
else | |
{ | |
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum); | |
work_sum = post_sum[map_id + offset]; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_addition | |
( | |
ulong n, | |
global const int * input, | |
global int * output, | |
global int * post_sum, | |
global int * pre_sum, | |
int identity, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
int val; | |
local int shared[1]; | |
if (g_id < n) | |
{ | |
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity; | |
else val = input[g_id]; | |
} | |
shared[l_id] = val; | |
int scan_result = val; | |
int post_block_sum, new_result; | |
int y1, y2, sum; | |
if(l_id == 0 && g_id < n) | |
{ | |
if(block > 0) | |
{ | |
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ]; | |
else if(block == 1) post_block_sum = pre_sum[0]; | |
else | |
{ | |
y1 = post_sum[ block/2 - 1 ]; | |
y2 = pre_sum [ block/2]; | |
post_block_sum = oper(y1, y2); | |
} | |
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum ); | |
} | |
else new_result = scan_result; | |
shared[ l_id ] = new_result; | |
} | |
sum = shared[ l_id ]; | |
for( size_t offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(g_id < n) output[ g_id ] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * keys, | |
global const double * vals, | |
global double * output, | |
global int * key_buf, | |
global double * val_buf | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int keys[1]; | |
double vals[1]; | |
}; | |
local struct Shared shared; | |
int key; | |
double val; | |
if (g_id < n) | |
{ | |
key = keys[g_id]; | |
val = vals[g_id]; | |
shared.keys[l_id] = key; | |
shared.vals[l_id] = val; | |
} | |
double sum = val; | |
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset && shared.keys[l_id - offset] == key) | |
{ | |
sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id >= n) return; | |
int key2 = -1; | |
if (g_id < n - 1) key2 = keys[g_id + 1]; | |
if (key != key2) output[g_id] = sum; | |
if (l_id == 0) | |
{ | |
key_buf[block] = shared.keys[0]; | |
val_buf[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan_by_key | |
( | |
ulong n, | |
global const int * key_sum, | |
global const double * pre_sum, | |
global double * post_sum, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
struct Shared | |
{ | |
int keys[1]; | |
double vals[1]; | |
}; | |
local struct Shared shared; | |
uint offset; | |
int key; | |
double work_sum; | |
if (map_id < n) | |
{ | |
int prev_key; | |
offset = 0; | |
key = key_sum[map_id]; | |
work_sum = pre_sum[map_id]; | |
post_sum[map_id] = work_sum; | |
for( offset = offset + 1; offset < work_per_thread; ++offset ) | |
{ | |
prev_key = key; | |
key = key_sum[ map_id + offset ]; | |
if ( map_id + offset < n ) | |
{ | |
double y = pre_sum[ map_id + offset ]; | |
if ( key == prev_key ) work_sum = oper( work_sum, y ); | |
else work_sum = y; | |
post_sum[ map_id + offset ] = work_sum; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[ l_id ] = work_sum; | |
shared.keys[ l_id ] = key; | |
double scan_sum = work_sum; | |
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n) | |
{ | |
if (l_id >= offset) | |
{ | |
int key1 = shared.keys[ l_id ]; | |
int key2 = shared.keys[ l_id - offset ]; | |
if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] ); | |
else scan_sum = shared.vals[ l_id ]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for( offset = 0; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
double y = post_sum[ map_id + offset ]; | |
int key1 = key_sum [ map_id + offset ]; | |
int key2 = shared.keys[ l_id - 1 ]; | |
if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] ); | |
post_sum[ map_id + offset ] = y; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_sum_by_key | |
( | |
ulong n, | |
global const int * key_sum, | |
global const double * post_sum, | |
global const int * keys, | |
global double * output | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
if (g_id >= n) return; | |
int key2 = keys[ g_id ]; | |
int key1 = (block > 0 ) ? key_sum[ block - 1 ] : key2 - 1; | |
int key3 = (g_id < n - 1) ? keys [ g_id + 1 ] : key2 - 1; | |
if (block > 0 && key1 == key2 && key2 != key3) | |
{ | |
double scan_result = output [ g_id ]; | |
double post_block_sum = post_sum[ block - 1 ]; | |
output[ g_id ] = oper( scan_result, post_block_sum ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void key_value_mapping | |
( | |
ulong n, | |
global const int * ikeys0, | |
global int * okeys0, | |
global double * ovals, | |
global int * offset, | |
global const double * ivals | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int num_sections = offset[n - 1] + 1; | |
int off = offset[idx]; | |
if (idx < (n - 1) && off != offset[idx + 1]) | |
{ | |
okeys0[off] = ikeys0[idx]; | |
ovals[off] = ivals[idx]; | |
} | |
if (idx == (n - 1)) | |
{ | |
okeys0[num_sections - 1] = ikeys0[idx]; | |
ovals[num_sections - 1] = ivals[idx]; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
bool comp | |
( | |
int a1, | |
long a2, | |
int b1, | |
long b2 | |
) | |
{ | |
return (a1 == b1) && (a2 == b2); | |
} | |
kernel void offset_calculation | |
( | |
ulong n, | |
global const int * keys0, | |
global const long * keys1, | |
global int * offsets | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
if (idx > 0) offsets[idx] = !comp(keys0[idx - 1], keys1[idx - 1], keys0[idx], keys1[idx]); | |
else offsets[idx] = 0; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan | |
( | |
ulong n, | |
global const int * input, | |
int identity, | |
global int * scan_buf1, | |
global int * scan_buf2, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
size_t offset = 1; | |
local int shared[2]; | |
if(block * 2 + l_id < n) shared[l_id] = input[block * 2 + l_id]; | |
if(block * 2 + l_id + 1 < n) shared[l_id + 1] = input[block * 2 + l_id + 1]; | |
if(exclusive && g_id == 0) shared[l_id] = oper(identity, input[0]); | |
for (size_t start = 1; start > 0; start >>= 1, offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id < start) | |
{ | |
size_t temp1 = offset * (2 * l_id + 1) - 1; | |
size_t temp2 = offset * (2 * l_id + 2) - 1; | |
int y2 = shared[temp2]; | |
int y1 = shared[temp1]; | |
shared[temp2] = oper(y2, y1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id == 0) | |
{ | |
scan_buf1[ block ] = shared[1]; | |
scan_buf2[ block ] = shared[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void intra_block_inclusive_scan | |
( | |
ulong n, | |
global int * post_sum, | |
global const int * pre_sum, | |
int identity, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
local int shared[1]; | |
size_t offset; | |
int work_sum; | |
if (map_id < n) | |
{ | |
offset = 0; | |
work_sum = pre_sum[map_id]; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
if (map_id + offset < n) work_sum = oper( work_sum, pre_sum[map_id + offset] ); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int scan_sum = work_sum; | |
shared[ l_id ] = work_sum; | |
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id >= offset) scan_sum = oper( scan_sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
work_sum = pre_sum[map_id]; | |
if (l_id > 0) | |
{ | |
work_sum = oper(work_sum, shared[l_id - 1]); | |
post_sum[map_id] = work_sum; | |
} | |
else post_sum[map_id] = work_sum; | |
for( offset = 1; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
int y = oper(pre_sum[map_id + offset], work_sum); | |
post_sum[ map_id + offset ] = y; | |
work_sum = y; | |
} | |
else | |
{ | |
post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum); | |
work_sum = post_sum[map_id + offset]; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
int oper | |
( | |
int x, | |
int y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_addition | |
( | |
ulong n, | |
global const int * input, | |
global int * output, | |
global int * post_sum, | |
global int * pre_sum, | |
int identity, | |
int exclusive | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
int val; | |
local int shared[1]; | |
if (g_id < n) | |
{ | |
if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity; | |
else val = input[g_id]; | |
} | |
shared[l_id] = val; | |
int scan_result = val; | |
int post_block_sum, new_result; | |
int y1, y2, sum; | |
if(l_id == 0 && g_id < n) | |
{ | |
if(block > 0) | |
{ | |
if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ]; | |
else if(block == 1) post_block_sum = pre_sum[0]; | |
else | |
{ | |
y1 = post_sum[ block/2 - 1 ]; | |
y2 = pre_sum [ block/2]; | |
post_block_sum = oper(y1, y2); | |
} | |
new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum ); | |
} | |
else new_result = scan_result; | |
shared[ l_id ] = new_result; | |
} | |
sum = shared[ l_id ]; | |
for( size_t offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] ); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared[ l_id ] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(g_id < n) output[ g_id ] = sum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_scan_by_key | |
( | |
ulong n, | |
global const int * keys, | |
global const double * vals, | |
global double * output, | |
global int * key_buf, | |
global double * val_buf | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
struct Shared | |
{ | |
int keys[1]; | |
double vals[1]; | |
}; | |
local struct Shared shared; | |
int key; | |
double val; | |
if (g_id < n) | |
{ | |
key = keys[g_id]; | |
val = vals[g_id]; | |
shared.keys[l_id] = key; | |
shared.vals[l_id] = val; | |
} | |
double sum = val; | |
for(size_t offset = 1; offset < 1; offset *= 2) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (l_id >= offset && shared.keys[l_id - offset] == key) | |
{ | |
sum = oper(sum, shared.vals[l_id - offset]); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[l_id] = sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (g_id >= n) return; | |
int key2 = -1; | |
if (g_id < n - 1) key2 = keys[g_id + 1]; | |
if (key != key2) output[g_id] = sum; | |
if (l_id == 0) | |
{ | |
key_buf[block] = shared.keys[0]; | |
val_buf[block] = shared.vals[0]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_inclusive_scan_by_key | |
( | |
ulong n, | |
global const int * key_sum, | |
global const double * pre_sum, | |
global double * post_sum, | |
uint work_per_thread | |
) | |
{ | |
size_t l_id = get_local_id(0); | |
size_t g_id = get_global_id(0); | |
size_t map_id = g_id * work_per_thread; | |
struct Shared | |
{ | |
int keys[1]; | |
double vals[1]; | |
}; | |
local struct Shared shared; | |
uint offset; | |
int key; | |
double work_sum; | |
if (map_id < n) | |
{ | |
int prev_key; | |
offset = 0; | |
key = key_sum[map_id]; | |
work_sum = pre_sum[map_id]; | |
post_sum[map_id] = work_sum; | |
for( offset = offset + 1; offset < work_per_thread; ++offset ) | |
{ | |
prev_key = key; | |
key = key_sum[ map_id + offset ]; | |
if ( map_id + offset < n ) | |
{ | |
double y = pre_sum[ map_id + offset ]; | |
if ( key == prev_key ) work_sum = oper( work_sum, y ); | |
else work_sum = y; | |
post_sum[ map_id + offset ] = work_sum; | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[ l_id ] = work_sum; | |
shared.keys[ l_id ] = key; | |
double scan_sum = work_sum; | |
for( offset = 1; offset < 1; offset *= 2 ) | |
{ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (map_id < n) | |
{ | |
if (l_id >= offset) | |
{ | |
int key1 = shared.keys[ l_id ]; | |
int key2 = shared.keys[ l_id - offset ]; | |
if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] ); | |
else scan_sum = shared.vals[ l_id ]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
shared.vals[ l_id ] = scan_sum; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for( offset = 0; offset < work_per_thread; ++offset ) | |
{ | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (map_id < n && l_id > 0) | |
{ | |
double y = post_sum[ map_id + offset ]; | |
int key1 = key_sum [ map_id + offset ]; | |
int key2 = shared.keys[ l_id - 1 ]; | |
if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] ); | |
post_sum[ map_id + offset ] = y; | |
} | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double oper | |
( | |
double x, | |
double y | |
) | |
{ | |
return x + y; | |
} | |
kernel void block_sum_by_key | |
( | |
ulong n, | |
global const int * key_sum, | |
global const double * post_sum, | |
global const int * keys, | |
global double * output | |
) | |
{ | |
size_t g_id = get_global_id(0); | |
size_t block = get_group_id(0); | |
if (g_id >= n) return; | |
int key2 = keys[ g_id ]; | |
int key1 = (block > 0 ) ? key_sum[ block - 1 ] : key2 - 1; | |
int key3 = (g_id < n - 1) ? keys [ g_id + 1 ] : key2 - 1; | |
if (block > 0 && key1 == key2 && key2 != key3) | |
{ | |
double scan_result = output [ g_id ]; | |
double post_block_sum = post_sum[ block - 1 ]; | |
output[ g_id ] = oper( scan_result, post_block_sum ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void key_value_mapping | |
( | |
ulong n, | |
global const int * ikeys0, | |
global const long * ikeys1, | |
global int * okeys0, | |
global long * okeys1, | |
global double * ovals, | |
global int * offset, | |
global const double * ivals | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
int num_sections = offset[n - 1] + 1; | |
int off = offset[idx]; | |
if (idx < (n - 1) && off != offset[idx + 1]) | |
{ | |
okeys0[off] = ikeys0[idx]; | |
okeys1[off] = ikeys1[idx]; | |
ovals[off] = ivals[idx]; | |
} | |
if (idx == (n - 1)) | |
{ | |
okeys0[num_sections - 1] = ikeys0[idx]; | |
okeys1[num_sections - 1] = ikeys1[idx]; | |
ovals[num_sections - 1] = ivals[idx]; | |
} | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 2.45 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"reduce_by_key" end time: Jan 30 11:28 IST | |
"reduce_by_key" time elapsed: 00:00:02 | |
---------------------------------------------------------- | |
25/30 Testing: logical | |
25/30 Test: logical | |
Command: "/tmp/vexcl/build/tests/logical" | |
Directory: /tmp/vexcl/build/tests | |
"logical" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597494 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 2 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
ulong prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = (prm_2 + idx); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (prm_1[idx]) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
int prm_1, | |
global int * prm_2, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( prm_1 * prm_2[idx] )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
ulong prm_2, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( prm_1[idx] > prm_2 )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( prm_1[idx] < prm_2 )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( !( prm_1[idx] ) )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
int prm_2, | |
int prm_3, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( !( ( ( prm_1[idx] + prm_2 ) > prm_3 ) ) )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_any_of_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
ulong prm_2, | |
global char * result | |
) | |
{ | |
for(ulong idx = 0; idx < n; ++idx) | |
{ | |
if (( !( ( prm_1[idx] > prm_2 ) ) )) | |
{ | |
result[0] = 1; | |
return; | |
} | |
} | |
result[0] = 0; | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.25 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"logical" end time: Jan 30 11:28 IST | |
"logical" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
26/30 Testing: threads | |
26/30 Test: threads | |
Command: "/tmp/vexcl/build/tests/threads" | |
Directory: /tmp/vexcl/build/tests | |
"threads" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597494 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 2 test cases... | |
##iiff ddeeffiinneedd((ccll__kkhhrr__ffpp6644)) | |
## pprraaggmmaa OOPPEENNCCLL EEXXTTEENNSSIIOONN ccll__kkhhrr__ffpp6644:: eennaabbllee | |
##eelliiff ddeeffiinneedd((ccll__aammdd__ffpp6644)) | |
## pprraaggmmaa OOPPEENNCCLL EEXXTTEENNSSIIOONN ccll__aammdd__ffpp6644:: eennaabbllee | |
##eennddiiff | |
kkeerrnneell vvooiidd vveexxccll__vveeccttoorr__kkeerrnneell | |
(( | |
uulloonngg nn,, | |
gglloobbaall iinntt ** pprrmm__11,, | |
iinntt pprrmm__22 | |
)) | |
{{ | |
uulloonngg cchhuunnkk__ssiizzee == ((nn ++ ggeett__gglloobbaall__ssiizzee((00)) -- 11)) // ggeett__gglloobbaall__ssiizzee((00));; | |
uulloonngg cchhuunnkk__ssttaarrtt == ggeett__gglloobbaall__iidd((00)) ** cchhuunnkk__ssiizzee;; | |
uulloonngg cchhuunnkk__eenndd == cchhuunnkk__ssttaarrtt ++ cchhuunnkk__ssiizzee;; | |
iiff ((nn << cchhuunnkk__eenndd)) cchhuunnkk__eenndd == nn;; | |
ffoorr((uulloonngg iiddxx == cchhuunnkk__ssttaarrtt;; iiddxx << cchhuunnkk__eenndd;; ++++iiddxx)) | |
{{ | |
pprrmm__11[[iiddxx]] == pprrmm__22;; | |
}} | |
}} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
long SUM_long | |
( | |
long prm1, | |
long prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global long * g_odata | |
) | |
{ | |
long mySum = (long)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; #+i+f defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
long SUM_long | |
( | |
long prm1, | |
long prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global int * prm_1, | |
global long * g_odata | |
) | |
{ | |
long mySum = (long)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + cihunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_long(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
dx) | |
{ | |
mySum = SUM_long(mySum, prm_1[idx]); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.07 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"threads" end time: Jan 30 11:28 IST | |
"threads" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
27/30 Testing: multiple_objects | |
27/30 Test: multiple_objects | |
Command: "/tmp/vexcl/build/tests/multiple_objects" | |
Directory: /tmp/vexcl/build/tests | |
"multiple_objects" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
Running 1 test case... | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
*** No errors detected | |
<end of output> | |
Test time = 0.02 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"multiple_objects" end time: Jan 30 11:28 IST | |
"multiple_objects" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
28/30 Testing: boost_compute_sort | |
28/30 Test: boost_compute_sort | |
Command: "/tmp/vexcl/build/tests/boost_compute_sort" | |
Directory: /tmp/vexcl/build/tests | |
"boost_compute_sort" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597494 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 2 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_2_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[2]; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
} | |
float random_float_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
uint res_i[1]; | |
float res_f[1]; | |
float res; | |
} ctr; | |
uint key[1]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
philox_uint_2_10(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 4294967295.0f; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_float_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
/Users/Rajesh/GDrive/codebase/repos/Qwixie/ext/headers/boost/compute/command_queue.hpp:1247: fatal error in "boost::compute::event boost::compute::command_queue::enqueue_nd_range_kernel(const boost::compute::kernel &, size_t, const size_t *, const size_t *, const size_t *, const boost::compute::wait_list &)": std::exception: Invalid Work Group Size | |
/tmp/vexcl/tests/context_setup.hpp:100: last checkpoint | |
*** 1 failure detected in test suite "BoostComputeSort" | |
<end of output> | |
Test time = 0.15 sec | |
---------------------------------------------------------- | |
Test Failed. | |
"boost_compute_sort" end time: Jan 30 11:28 IST | |
"boost_compute_sort" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
29/30 Testing: boost_compute_scan | |
29/30 Test: boost_compute_scan | |
Command: "/tmp/vexcl/build/tests/boost_compute_scan" | |
Directory: /tmp/vexcl/build/tests | |
"boost_compute_scan" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597494 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 3 test cases... | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2[idx] + prm_3[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double * prm_1, | |
double prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] += prm_2; | |
} | |
} | |
*** No errors detected | |
<end of output> | |
Test time = 0.13 sec | |
---------------------------------------------------------- | |
Test Passed. | |
"boost_compute_scan" end time: Jan 30 11:28 IST | |
"boost_compute_scan" time elapsed: 00:00:00 | |
---------------------------------------------------------- | |
30/30 Testing: fft | |
30/30 Test: fft | |
Command: "/tmp/vexcl/build/tests/fft" | |
Directory: /tmp/vexcl/build/tests | |
"fft" start time: Jan 30 11:28 IST | |
Output: | |
---------------------------------------------------------- | |
seed: 1422597494 | |
1. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
2. Intel(R) Core(TM) i7-2820QM CPU @ 2.30GHz (Apple) | |
Running 4 test cases... | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 144 FP additions, 40 FP multiplications, | |
* (or, 104 additions, 0 multiplications, 40 fused multiply/add), | |
* 97 stack variables, 3 constants, and 64 memory accesses | |
*/ | |
DEVICE void | |
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9, | |
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13, | |
real2_t * u14, real2_t * u15) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
real2_t v8 = *u8; | |
real2_t v9 = *u9; | |
real2_t v10 = *u10; | |
real2_t v11 = *u11; | |
real2_t v12 = *u12; | |
real2_t v13 = *u13; | |
real2_t v14 = *u14; | |
real2_t v15 = *u15; | |
{ | |
const real_t KP923879532 = | |
+0.923879532511286756128183189396788286822416626; | |
const real_t KP414213562 = | |
+0.414213562373095048801688724209698078569671875; | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q; | |
{ | |
real_t T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, | |
T2h, T22, T1D; | |
real_t T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, T12, Tj, T11, | |
Ti, T1V, TZ; | |
real_t Tk, T13; | |
{ | |
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e; | |
{ | |
real_t Tz, T4, TL, T3, T1k, Ty, T5, TA; | |
{ | |
real_t Tw, T1, T2, Tx; | |
Tw = v0.y; | |
T1 = v0.x; | |
T2 = v8.x; | |
Tx = v8.y; | |
Tz = v4.y; | |
T4 = v4.x; | |
TL = T1 - T2; | |
T3 = T1 + T2; | |
T1k = Tw - Tx; | |
Ty = Tw + Tx; | |
T5 = v12.x; | |
TA = v12.y; | |
} | |
{ | |
real_t T18, Tn, To, T19; | |
T18 = v15.y; | |
Tn = v15.x; | |
{ | |
real_t T1j, T6, TM, TB; | |
T1j = T4 - T5; | |
T6 = T4 + T5; | |
TM = Tz - TA; | |
TB = Tz + TA; | |
T1l = T1j + T1k; | |
T1H = T1k - T1j; | |
T1R = T3 - T6; | |
T7 = T3 + T6; | |
T1x = TL + TM; | |
TN = TL - TM; | |
TC = Ty + TB; | |
T25 = Ty - TB; | |
To = v7.x; | |
T19 = v7.y; | |
} | |
T1d = v3.y; | |
Tq = v3.x; | |
T1c = Tn - To; | |
Tp = Tn + To; | |
T20 = T18 + T19; | |
T1a = T18 - T19; | |
Tr = v11.x; | |
T1e = v11.y; | |
} | |
} | |
{ | |
real_t TG, Tb, TP, Ta, TO, TF, Tc, TH; | |
{ | |
real_t TD, T8, T9, TE; | |
TD = v2.y; | |
T8 = v2.x; | |
{ | |
real_t T17, Ts, T21, T1f; | |
T17 = Tq - Tr; | |
Ts = Tq + Tr; | |
T21 = T1d + T1e; | |
T1f = T1d - T1e; | |
T1E = T1a - T17; | |
T1b = T17 + T1a; | |
T1Z = Tp - Ts; | |
Tt = Tp + Ts; | |
T2h = T20 + T21; | |
T22 = T20 - T21; | |
T1D = T1c + T1f; | |
T1g = T1c - T1f; | |
T9 = v10.x; | |
TE = v10.y; | |
} | |
TG = v14.y; | |
Tb = v14.x; | |
TP = T8 - T9; | |
Ta = T8 + T9; | |
TO = TD - TE; | |
TF = TD + TE; | |
Tc = v6.x; | |
TH = v6.y; | |
} | |
{ | |
real_t TR, Td, TS, TI; | |
T1n = TP + TO; | |
TQ = TO - TP; | |
TR = Tb - Tc; | |
Td = Tb + Tc; | |
TS = TG - TH; | |
TI = TG + TH; | |
Te = Ta + Td; | |
T26 = Td - Ta; | |
TT = TR + TS; | |
T1m = TR - TS; | |
TJ = TF + TI; | |
T1S = TF - TI; | |
} | |
} | |
{ | |
real_t TX, Tg, Th, TY; | |
TX = v1.y; | |
Tg = v1.x; | |
Th = v9.x; | |
TY = v9.y; | |
T12 = v5.y; | |
Tj = v5.x; | |
T11 = Tg - Th; | |
Ti = Tg + Th; | |
T1V = TX + TY; | |
TZ = TX - TY; | |
Tk = v13.x; | |
T13 = v13.y; | |
} | |
} | |
{ | |
real_t T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i; | |
{ | |
real_t Tf, Tu, T2j, T2k, T2g; | |
T2f = T7 - Te; | |
Tf = T7 + Te; | |
{ | |
real_t TW, Tl, T1W, T14, Tm; | |
TW = Tj - Tk; | |
Tl = Tj + Tk; | |
T1W = T12 + T13; | |
T14 = T12 - T13; | |
T1B = TZ - TW; | |
T10 = TW + TZ; | |
T1U = Ti - Tl; | |
Tm = Ti + Tl; | |
T2g = T1V + T1W; | |
T1X = T1V - T1W; | |
T1A = T11 + T14; | |
T15 = T11 - T14; | |
Tu = Tm + Tt; | |
Tv = Tt - Tm; | |
} | |
TK = TC - TJ; | |
T2j = TC + TJ; | |
T2k = T2g + T2h; | |
T2i = T2g - T2h; | |
v0.x = Tf + Tu; | |
v0.y = T2j + T2k; | |
v8.y = T2j - T2k; | |
v8.x = Tf - Tu; | |
} | |
{ | |
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, | |
T23; | |
T29 = T1R - T1S; | |
T1T = T1R + T1S; | |
v4.y = Tv + TK; | |
v4.x = T2f + T2i; | |
v12.x = T2f - T2i; | |
v12.y = TK - Tv; | |
T27 = T25 - T26; | |
T2d = T26 + T25; | |
T2a = T1X - T1U; | |
T1Y = T1U + T1X; | |
T23 = T1Z - T22; | |
T2b = T1Z + T22; | |
T28 = T23 - T1Y; | |
T24 = T1Y + T23; | |
{ | |
real_t T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, | |
T1i; | |
{ | |
real_t T1o, T2e, T2c, TU, T16, T1h; | |
T1I = TQ + TT; | |
TU = TQ - TT; | |
T2e = T2a + T2b; | |
T2c = T2a - T2b; | |
TV = fma (KP707106781, TU, TN); | |
T1v = fma (-KP707106781, TU, TN); | |
v10.x = fma (-KP707106781, T24, T1T); | |
v10.y = fma (-KP707106781, T2e, T2d); | |
v2.y = fma (KP707106781, T2e, T2d); | |
v2.x = fma (KP707106781, T24, T1T); | |
v6.y = fma (KP707106781, T28, T27); | |
v6.x = fma (KP707106781, T2c, T29); | |
v14.x = fma (-KP707106781, T2c, T29); | |
v14.y = fma (-KP707106781, T28, T27); | |
T1o = T1m - T1n; | |
T1y = T1n + T1m; | |
T1t = fma (-KP414213562, T10, T15); | |
T16 = fma (KP414213562, T15, T10); | |
T1h = fma (-KP414213562, T1g, T1b); | |
T1s = fma (KP414213562, T1b, T1g); | |
T1r = fma (KP707106781, T1o, T1l); | |
T1p = fma (-KP707106781, T1o, T1l); | |
T1q = T16 + T1h; | |
T1i = T16 - T1h; | |
} | |
{ | |
real_t T1w, T1u, T1C, T1F; | |
T1w = T1t + T1s; | |
T1u = T1s - T1t; | |
T1z = fma (KP707106781, T1y, T1x); | |
T1L = fma (-KP707106781, T1y, T1x); | |
v15.y = fma (KP923879532, T1q, T1p); | |
v15.x = fma (KP923879532, T1w, T1v); | |
v7.x = fma (-KP923879532, T1w, T1v); | |
v7.y = fma (-KP923879532, T1q, T1p); | |
v3.x = fma (KP923879532, T1i, TV); | |
v3.y = fma (KP923879532, T1u, T1r); | |
v11.y = fma (-KP923879532, T1u, T1r); | |
v11.x = fma (-KP923879532, T1i, TV); | |
T1M = fma (-KP414213562, T1A, T1B); | |
T1C = fma (KP414213562, T1B, T1A); | |
T1F = fma (-KP414213562, T1E, T1D); | |
T1N = fma (KP414213562, T1D, T1E); | |
T1P = fma (KP707106781, T1I, T1H); | |
T1J = fma (-KP707106781, T1I, T1H); | |
T1K = T1F - T1C; | |
T1G = T1C + T1F; | |
} | |
} | |
} | |
} | |
} | |
T1O = T1M - T1N; | |
T1Q = T1M + T1N; | |
v1.x = fma (KP923879532, T1G, T1z); | |
v1.y = fma (KP923879532, T1Q, T1P); | |
v9.y = fma (-KP923879532, T1Q, T1P); | |
v9.x = fma (-KP923879532, T1G, T1z); | |
v5.y = fma (KP923879532, T1K, T1J); | |
v5.x = fma (KP923879532, T1O, T1L); | |
v13.x = fma (-KP923879532, T1O, T1L); | |
v13.y = fma (-KP923879532, T1K, T1J); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
*u8 = v8; | |
*u9 = v9; | |
*u10 = v10; | |
*u11 = v11; | |
*u12 = v12; | |
*u13 = v13; | |
*u14 = v14; | |
*u15 = v15; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 16; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
float2 v8 = x[8 * threads]; | |
float2 v9 = x[9 * threads]; | |
float2 v10 = x[10 * threads]; | |
float2 v11 = x[11 * threads]; | |
float2 v12 = x[12 * threads]; | |
float2 v13 = x[13 * threads]; | |
float2 v14 = x[14 * threads]; | |
float2 v15 = x[15 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.3926990926265717 * k / p)); | |
v2 = mul(v2, twiddle((float)-0.7853981852531433 * k / p)); | |
v3 = mul(v3, twiddle((float)-1.178097248077393 * k / p)); | |
v4 = mul(v4, twiddle((float)-1.570796370506287 * k / p)); | |
v5 = mul(v5, twiddle((float)-1.963495492935181 * k / p)); | |
v6 = mul(v6, twiddle((float)-2.356194496154785 * k / p)); | |
v7 = mul(v7, twiddle((float)-2.748893737792969 * k / p)); | |
v8 = mul(v8, twiddle((float)-3.141592741012573 * k / p)); | |
v9 = mul(v9, twiddle((float)-3.534291744232178 * k / p)); | |
v10 = mul(v10, twiddle((float)-3.926990985870361 * k / p)); | |
v11 = mul(v11, twiddle((float)-4.319690227508545 * k / p)); | |
v12 = mul(v12, twiddle((float)-4.71238899230957 * k / p)); | |
v13 = mul(v13, twiddle((float)-5.105088233947754 * k / p)); | |
v14 = mul(v14, twiddle((float)-5.497787475585938 * k / p)); | |
v15 = mul(v15, twiddle((float)-5.890486240386963 * k / p)); | |
} | |
dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15); | |
const size_t j = k + (i - k) * 16; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
y[11 * p] = v11; | |
y[12 * p] = v12; | |
y[13 * p] = v13; | |
y[14 * p] = v14; | |
y[15 * p] = v15; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float2 r2c | |
( | |
float v | |
) | |
{ | |
float2 r = {v, 0}; return r; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float2 * prm_1, | |
global float * prm_2, | |
global float * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = r2c( ( prm_2[idx] * prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float c2r | |
( | |
float2 v | |
) | |
{ | |
return v.x; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
float prm_2, | |
global float2 * prm_3, | |
int prm_4 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] += ( ( prm_2 * c2r( prm_3[idx] ) ) * prm_4 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
void philox_uint_2_10 | |
( | |
uint * ctr, | |
uint * key | |
) | |
{ | |
uint m[2]; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
key[0] += 0x9E3779B9; | |
m[0] = mul_hi(0xD256D193, ctr[0]); | |
m[1] = 0xD256D193 * ctr[0]; | |
ctr[0] = m[0] ^ key[0] ^ ctr[1]; | |
ctr[1] = m[1]; | |
} | |
float random_float_philox | |
( | |
ulong prm1, | |
ulong prm2 | |
) | |
{ | |
union | |
{ | |
uint ctr[2]; | |
uint res_i[1]; | |
float res_f[1]; | |
float res; | |
} ctr; | |
uint key[1]; | |
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2; | |
key[0] = 0x12345678; | |
philox_uint_2_10(ctr.ctr, key); | |
ctr.res_f[0] = ctr.res_i[0] / 4294967295.0f; | |
return ctr.res; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
ulong prm_2, | |
int prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = random_float_philox( (prm_2 + idx), prm_3 ); | |
} | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 144 FP additions, 40 FP multiplications, | |
* (or, 104 additions, 0 multiplications, 40 fused multiply/add), | |
* 97 stack variables, 3 constants, and 64 memory accesses | |
*/ | |
DEVICE void | |
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9, | |
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13, | |
real2_t * u14, real2_t * u15) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
real2_t v8 = *u8; | |
real2_t v9 = *u9; | |
real2_t v10 = *u10; | |
real2_t v11 = *u11; | |
real2_t v12 = *u12; | |
real2_t v13 = *u13; | |
real2_t v14 = *u14; | |
real2_t v15 = *u15; | |
{ | |
const real_t KP923879532 = | |
+0.923879532511286756128183189396788286822416626; | |
const real_t KP414213562 = | |
+0.414213562373095048801688724209698078569671875; | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q; | |
{ | |
real_t T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, | |
T2h, T22, T1D; | |
real_t T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, T12, Tj, T11, | |
Ti, T1V, TZ; | |
real_t Tk, T13; | |
{ | |
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e; | |
{ | |
real_t Tz, T4, TL, T3, T1k, Ty, T5, TA; | |
{ | |
real_t Tw, T1, T2, Tx; | |
Tw = v0.y; | |
T1 = v0.x; | |
T2 = v8.x; | |
Tx = v8.y; | |
Tz = v4.y; | |
T4 = v4.x; | |
TL = T1 - T2; | |
T3 = T1 + T2; | |
T1k = Tw - Tx; | |
Ty = Tw + Tx; | |
T5 = v12.x; | |
TA = v12.y; | |
} | |
{ | |
real_t T18, Tn, To, T19; | |
T18 = v15.y; | |
Tn = v15.x; | |
{ | |
real_t T1j, T6, TM, TB; | |
T1j = T4 - T5; | |
T6 = T4 + T5; | |
TM = Tz - TA; | |
TB = Tz + TA; | |
T1l = T1j + T1k; | |
T1H = T1k - T1j; | |
T1R = T3 - T6; | |
T7 = T3 + T6; | |
T1x = TL + TM; | |
TN = TL - TM; | |
TC = Ty + TB; | |
T25 = Ty - TB; | |
To = v7.x; | |
T19 = v7.y; | |
} | |
T1d = v3.y; | |
Tq = v3.x; | |
T1c = Tn - To; | |
Tp = Tn + To; | |
T20 = T18 + T19; | |
T1a = T18 - T19; | |
Tr = v11.x; | |
T1e = v11.y; | |
} | |
} | |
{ | |
real_t TG, Tb, TP, Ta, TO, TF, Tc, TH; | |
{ | |
real_t TD, T8, T9, TE; | |
TD = v2.y; | |
T8 = v2.x; | |
{ | |
real_t T17, Ts, T21, T1f; | |
T17 = Tq - Tr; | |
Ts = Tq + Tr; | |
T21 = T1d + T1e; | |
T1f = T1d - T1e; | |
T1E = T1a - T17; | |
T1b = T17 + T1a; | |
T1Z = Tp - Ts; | |
Tt = Tp + Ts; | |
T2h = T20 + T21; | |
T22 = T20 - T21; | |
T1D = T1c + T1f; | |
T1g = T1c - T1f; | |
T9 = v10.x; | |
TE = v10.y; | |
} | |
TG = v14.y; | |
Tb = v14.x; | |
TP = T8 - T9; | |
Ta = T8 + T9; | |
TO = TD - TE; | |
TF = TD + TE; | |
Tc = v6.x; | |
TH = v6.y; | |
} | |
{ | |
real_t TR, Td, TS, TI; | |
T1n = TP + TO; | |
TQ = TO - TP; | |
TR = Tb - Tc; | |
Td = Tb + Tc; | |
TS = TG - TH; | |
TI = TG + TH; | |
Te = Ta + Td; | |
T26 = Td - Ta; | |
TT = TR + TS; | |
T1m = TR - TS; | |
TJ = TF + TI; | |
T1S = TF - TI; | |
} | |
} | |
{ | |
real_t TX, Tg, Th, TY; | |
TX = v1.y; | |
Tg = v1.x; | |
Th = v9.x; | |
TY = v9.y; | |
T12 = v5.y; | |
Tj = v5.x; | |
T11 = Tg - Th; | |
Ti = Tg + Th; | |
T1V = TX + TY; | |
TZ = TX - TY; | |
Tk = v13.x; | |
T13 = v13.y; | |
} | |
} | |
{ | |
real_t T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i; | |
{ | |
real_t Tf, Tu, T2j, T2k, T2g; | |
T2f = T7 - Te; | |
Tf = T7 + Te; | |
{ | |
real_t TW, Tl, T1W, T14, Tm; | |
TW = Tj - Tk; | |
Tl = Tj + Tk; | |
T1W = T12 + T13; | |
T14 = T12 - T13; | |
T1B = TZ - TW; | |
T10 = TW + TZ; | |
T1U = Ti - Tl; | |
Tm = Ti + Tl; | |
T2g = T1V + T1W; | |
T1X = T1V - T1W; | |
T1A = T11 + T14; | |
T15 = T11 - T14; | |
Tu = Tm + Tt; | |
Tv = Tt - Tm; | |
} | |
TK = TC - TJ; | |
T2j = TC + TJ; | |
T2k = T2g + T2h; | |
T2i = T2g - T2h; | |
v0.x = Tf + Tu; | |
v0.y = T2j + T2k; | |
v8.y = T2j - T2k; | |
v8.x = Tf - Tu; | |
} | |
{ | |
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, | |
T23; | |
T29 = T1R - T1S; | |
T1T = T1R + T1S; | |
v4.y = Tv + TK; | |
v4.x = T2f + T2i; | |
v12.x = T2f - T2i; | |
v12.y = TK - Tv; | |
T27 = T25 - T26; | |
T2d = T26 + T25; | |
T2a = T1X - T1U; | |
T1Y = T1U + T1X; | |
T23 = T1Z - T22; | |
T2b = T1Z + T22; | |
T28 = T23 - T1Y; | |
T24 = T1Y + T23; | |
{ | |
real_t T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, | |
T1i; | |
{ | |
real_t T1o, T2e, T2c, TU, T16, T1h; | |
T1I = TQ + TT; | |
TU = TQ - TT; | |
T2e = T2a + T2b; | |
T2c = T2a - T2b; | |
TV = fma (KP707106781, TU, TN); | |
T1v = fma (-KP707106781, TU, TN); | |
v10.x = fma (-KP707106781, T24, T1T); | |
v10.y = fma (-KP707106781, T2e, T2d); | |
v2.y = fma (KP707106781, T2e, T2d); | |
v2.x = fma (KP707106781, T24, T1T); | |
v6.y = fma (KP707106781, T28, T27); | |
v6.x = fma (KP707106781, T2c, T29); | |
v14.x = fma (-KP707106781, T2c, T29); | |
v14.y = fma (-KP707106781, T28, T27); | |
T1o = T1m - T1n; | |
T1y = T1n + T1m; | |
T1t = fma (-KP414213562, T10, T15); | |
T16 = fma (KP414213562, T15, T10); | |
T1h = fma (-KP414213562, T1g, T1b); | |
T1s = fma (KP414213562, T1b, T1g); | |
T1r = fma (KP707106781, T1o, T1l); | |
T1p = fma (-KP707106781, T1o, T1l); | |
T1q = T16 + T1h; | |
T1i = T16 - T1h; | |
} | |
{ | |
real_t T1w, T1u, T1C, T1F; | |
T1w = T1t + T1s; | |
T1u = T1s - T1t; | |
T1z = fma (KP707106781, T1y, T1x); | |
T1L = fma (-KP707106781, T1y, T1x); | |
v15.y = fma (KP923879532, T1q, T1p); | |
v15.x = fma (KP923879532, T1w, T1v); | |
v7.x = fma (-KP923879532, T1w, T1v); | |
v7.y = fma (-KP923879532, T1q, T1p); | |
v3.x = fma (KP923879532, T1i, TV); | |
v3.y = fma (KP923879532, T1u, T1r); | |
v11.y = fma (-KP923879532, T1u, T1r); | |
v11.x = fma (-KP923879532, T1i, TV); | |
T1M = fma (-KP414213562, T1A, T1B); | |
T1C = fma (KP414213562, T1B, T1A); | |
T1F = fma (-KP414213562, T1E, T1D); | |
T1N = fma (KP414213562, T1D, T1E); | |
T1P = fma (KP707106781, T1I, T1H); | |
T1J = fma (-KP707106781, T1I, T1H); | |
T1K = T1F - T1C; | |
T1G = T1C + T1F; | |
} | |
} | |
} | |
} | |
} | |
T1O = T1M - T1N; | |
T1Q = T1M + T1N; | |
v1.x = fma (KP923879532, T1G, T1z); | |
v1.y = fma (KP923879532, T1Q, T1P); | |
v9.y = fma (-KP923879532, T1Q, T1P); | |
v9.x = fma (-KP923879532, T1G, T1z); | |
v5.y = fma (KP923879532, T1K, T1J); | |
v5.x = fma (KP923879532, T1O, T1L); | |
v13.x = fma (-KP923879532, T1O, T1L); | |
v13.y = fma (-KP923879532, T1K, T1J); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
*u8 = v8; | |
*u9 = v9; | |
*u10 = v10; | |
*u11 = v11; | |
*u12 = v12; | |
*u13 = v13; | |
*u14 = v14; | |
*u15 = v15; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 16; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
float2 v8 = x[8 * threads]; | |
float2 v9 = x[9 * threads]; | |
float2 v10 = x[10 * threads]; | |
float2 v11 = x[11 * threads]; | |
float2 v12 = x[12 * threads]; | |
float2 v13 = x[13 * threads]; | |
float2 v14 = x[14 * threads]; | |
float2 v15 = x[15 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.3926990926265717 * k / p)); | |
v2 = mul(v2, twiddle((float)-0.7853981852531433 * k / p)); | |
v3 = mul(v3, twiddle((float)-1.178097248077393 * k / p)); | |
v4 = mul(v4, twiddle((float)-1.570796370506287 * k / p)); | |
v5 = mul(v5, twiddle((float)-1.963495492935181 * k / p)); | |
v6 = mul(v6, twiddle((float)-2.356194496154785 * k / p)); | |
v7 = mul(v7, twiddle((float)-2.748893737792969 * k / p)); | |
v8 = mul(v8, twiddle((float)-3.141592741012573 * k / p)); | |
v9 = mul(v9, twiddle((float)-3.534291744232178 * k / p)); | |
v10 = mul(v10, twiddle((float)-3.926990985870361 * k / p)); | |
v11 = mul(v11, twiddle((float)-4.319690227508545 * k / p)); | |
v12 = mul(v12, twiddle((float)-4.71238899230957 * k / p)); | |
v13 = mul(v13, twiddle((float)-5.105088233947754 * k / p)); | |
v14 = mul(v14, twiddle((float)-5.497787475585938 * k / p)); | |
v15 = mul(v15, twiddle((float)-5.890486240386963 * k / p)); | |
} | |
dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15); | |
const size_t j = k + (i - k) * 16; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
y[11 * p] = v11; | |
y[12 * p] = v12; | |
y[13 * p] = v13; | |
y[14 * p] = v14; | |
y[15 * p] = v15; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.7853981852531433 * k / p)); | |
v2 = mul(v2, twiddle((float)-1.570796370506287 * k / p)); | |
v3 = mul(v3, twiddle((float)-2.356194496154785 * k / p)); | |
v4 = mul(v4, twiddle((float)-3.141592741012573 * k / p)); | |
v5 = mul(v5, twiddle((float)-3.926990985870361 * k / p)); | |
v6 = mul(v6, twiddle((float)-4.71238899230957 * k / p)); | |
v7 = mul(v7, twiddle((float)-5.497787475585938 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
typedef float real_t; | |
typedef float2 real2_t; | |
float2 mul | |
( | |
float2 a, | |
float2 b | |
) | |
{ | |
float2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
float2 twiddle | |
( | |
float alpha | |
) | |
{ | |
float2 r = {native_cos(alpha), native_sin(alpha)}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 16 -name dft16 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 144 FP additions, 40 FP multiplications, | |
* (or, 104 additions, 0 multiplications, 40 fused multiply/add), | |
* 97 stack variables, 3 constants, and 64 memory accesses | |
*/ | |
DEVICE void | |
dft16 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9, | |
real2_t * u10, real2_t * u11, real2_t * u12, real2_t * u13, | |
real2_t * u14, real2_t * u15) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
real2_t v8 = *u8; | |
real2_t v9 = *u9; | |
real2_t v10 = *u10; | |
real2_t v11 = *u11; | |
real2_t v12 = *u12; | |
real2_t v13 = *u13; | |
real2_t v14 = *u14; | |
real2_t v15 = *u15; | |
{ | |
const real_t KP923879532 = | |
+0.923879532511286756128183189396788286822416626; | |
const real_t KP414213562 = | |
+0.414213562373095048801688724209698078569671875; | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t T1z, T1P, T1N, T1M, T1L, T1J, T1K, T1G, T1O, T1Q; | |
{ | |
real_t T1H, TN, T25, T7, T1l, T1x, TC, T1R, T1A, T1b, T1Z, Tt, | |
T2h, T22, T1B; | |
real_t T1g, T1m, TQ, Te, T1S, T26, TJ, TT, T1n, T12, Tj, T11, | |
Ti, T1V, TZ; | |
real_t Tk, T13; | |
{ | |
real_t T1d, Tq, T1c, Tp, T20, T1a, Tr, T1e; | |
{ | |
real_t Tz, T4, T1j, T3, TM, Ty, T5, TA; | |
{ | |
real_t Tw, T1, T2, Tx; | |
Tw = v0.y; | |
T1 = v0.x; | |
T2 = v8.x; | |
Tx = v8.y; | |
Tz = v4.y; | |
T4 = v4.x; | |
T1j = T1 - T2; | |
T3 = T1 + T2; | |
TM = Tw - Tx; | |
Ty = Tw + Tx; | |
T5 = v12.x; | |
TA = v12.y; | |
} | |
{ | |
real_t T18, Tn, To, T19; | |
T18 = v15.y; | |
Tn = v15.x; | |
{ | |
real_t TL, T6, T1k, TB; | |
TL = T4 - T5; | |
T6 = T4 + T5; | |
T1k = Tz - TA; | |
TB = Tz + TA; | |
T1H = TM - TL; | |
TN = TL + TM; | |
T25 = T3 - T6; | |
T7 = T3 + T6; | |
T1l = T1j - T1k; | |
T1x = T1j + T1k; | |
TC = Ty + TB; | |
T1R = Ty - TB; | |
To = v7.x; | |
T19 = v7.y; | |
} | |
T1d = v3.y; | |
Tq = v3.x; | |
T1c = Tn - To; | |
Tp = Tn + To; | |
T20 = T18 + T19; | |
T1a = T18 - T19; | |
Tr = v11.x; | |
T1e = v11.y; | |
} | |
} | |
{ | |
real_t TG, Tb, TO, Ta, TP, TF, Tc, TH; | |
{ | |
real_t TD, T8, T9, TE; | |
TD = v2.y; | |
T8 = v2.x; | |
{ | |
real_t T17, Ts, T21, T1f; | |
T17 = Tq - Tr; | |
Ts = Tq + Tr; | |
T21 = T1d + T1e; | |
T1f = T1d - T1e; | |
T1A = T1a - T17; | |
T1b = T17 + T1a; | |
T1Z = Tp - Ts; | |
Tt = Tp + Ts; | |
T2h = T20 + T21; | |
T22 = T20 - T21; | |
T1B = T1c + T1f; | |
T1g = T1c - T1f; | |
T9 = v10.x; | |
TE = v10.y; | |
} | |
TG = v14.y; | |
Tb = v14.x; | |
TO = T8 - T9; | |
Ta = T8 + T9; | |
TP = TD - TE; | |
TF = TD + TE; | |
Tc = v6.x; | |
TH = v6.y; | |
} | |
{ | |
real_t TS, Td, TR, TI; | |
T1m = TO - TP; | |
TQ = TO + TP; | |
TS = Tb - Tc; | |
Td = Tb + Tc; | |
TR = TG - TH; | |
TI = TG + TH; | |
Te = Ta + Td; | |
T1S = Ta - Td; | |
T26 = TI - TF; | |
TJ = TF + TI; | |
TT = TR - TS; | |
T1n = TS + TR; | |
} | |
} | |
{ | |
real_t TX, Tg, Th, TY; | |
TX = v1.y; | |
Tg = v1.x; | |
Th = v9.x; | |
TY = v9.y; | |
T12 = v5.y; | |
Tj = v5.x; | |
T11 = Tg - Th; | |
Ti = Tg + Th; | |
T1V = TX + TY; | |
TZ = TX - TY; | |
Tk = v13.x; | |
T13 = v13.y; | |
} | |
} | |
{ | |
real_t T2j, T1D, T10, T1U, T1X, T1E, T15, Tv, TK, T2k; | |
{ | |
real_t Tf, Tu, T2f, T2i, T2g; | |
T2j = T7 - Te; | |
Tf = T7 + Te; | |
{ | |
real_t TW, Tl, T1W, T14, Tm; | |
TW = Tj - Tk; | |
Tl = Tj + Tk; | |
T1W = T12 + T13; | |
T14 = T12 - T13; | |
T1D = TZ - TW; | |
T10 = TW + TZ; | |
T1U = Ti - Tl; | |
Tm = Ti + Tl; | |
T2g = T1V + T1W; | |
T1X = T1V - T1W; | |
T1E = T11 + T14; | |
T15 = T11 - T14; | |
Tu = Tm + Tt; | |
Tv = Tm - Tt; | |
} | |
TK = TC - TJ; | |
T2f = TC + TJ; | |
T2i = T2g + T2h; | |
T2k = T2h - T2g; | |
v0.x = Tf + Tu; | |
v0.y = T2f + T2i; | |
v8.y = T2f - T2i; | |
v8.x = Tf - Tu; | |
} | |
{ | |
real_t T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, | |
T23; | |
T29 = T1S + T1R; | |
T1T = T1R - T1S; | |
v4.y = Tv + TK; | |
v4.x = T2j + T2k; | |
v12.x = T2j - T2k; | |
v12.y = TK - Tv; | |
T27 = T25 + T26; | |
T2d = T25 - T26; | |
T2a = T1U + T1X; | |
T1Y = T1U - T1X; | |
T23 = T1Z + T22; | |
T2b = T22 - T1Z; | |
T28 = T1Y + T23; | |
T24 = T1Y - T23; | |
{ | |
real_t T1y, TV, T1r, T1I, T1s, T1t, T1v, T1p, T1q, | |
T1i; | |
{ | |
real_t T1o, T2e, T2c, TU, T16, T1h; | |
T1y = TT - TQ; | |
TU = TQ + TT; | |
T2e = T2b - T2a; | |
T2c = T2a + T2b; | |
TV = fma (KP707106781, TU, TN); | |
T1r = fma (-KP707106781, TU, TN); | |
v14.y = fma (-KP707106781, T24, T1T); | |
v14.x = fma (-KP707106781, T2e, T2d); | |
v6.x = fma (KP707106781, T2e, T2d); | |
v6.y = fma (KP707106781, T24, T1T); | |
v2.x = fma (KP707106781, T28, T27); | |
v2.y = fma (KP707106781, T2c, T29); | |
v10.y = fma (-KP707106781, T2c, T29); | |
v10.x = fma (-KP707106781, T28, T27); | |
T1o = T1m + T1n; | |
T1I = T1m - T1n; | |
T1s = fma (-KP414213562, T10, T15); | |
T16 = fma (KP414213562, T15, T10); | |
T1h = fma (-KP414213562, T1g, T1b); | |
T1t = fma (KP414213562, T1b, T1g); | |
T1v = fma (KP707106781, T1o, T1l); | |
T1p = fma (-KP707106781, T1o, T1l); | |
T1q = T1h - T16; | |
T1i = T16 + T1h; | |
} | |
{ | |
real_t T1w, T1u, T1C, T1F; | |
T1w = T1s + T1t; | |
T1u = T1s - T1t; | |
T1z = fma (KP707106781, T1y, T1x); | |
T1P = fma (-KP707106781, T1y, T1x); | |
v1.y = fma (KP923879532, T1i, TV); | |
v1.x = fma (KP923879532, T1w, T1v); | |
v9.x = fma (-KP923879532, T1w, T1v); | |
v9.y = fma (-KP923879532, T1i, TV); | |
v5.x = fma (KP923879532, T1q, T1p); | |
v5.y = fma (KP923879532, T1u, T1r); | |
v13.y = fma (-KP923879532, T1u, T1r); | |
v13.x = fma (-KP923879532, T1q, T1p); | |
T1N = fma (-KP414213562, T1A, T1B); | |
T1C = fma (KP414213562, T1B, T1A); | |
T1F = fma (-KP414213562, T1E, T1D); | |
T1M = fma (KP414213562, T1D, T1E); | |
T1L = fma (KP707106781, T1I, T1H); | |
T1J = fma (-KP707106781, T1I, T1H); | |
T1K = T1F + T1C; | |
T1G = T1C - T1F; | |
} | |
} | |
} | |
} | |
} | |
T1O = T1M - T1N; | |
T1Q = T1M + T1N; | |
v15.y = fma (KP923879532, T1K, T1J); | |
v15.x = fma (KP923879532, T1Q, T1P); | |
v7.x = fma (-KP923879532, T1Q, T1P); | |
v7.y = fma (-KP923879532, T1K, T1J); | |
v3.x = fma (KP923879532, T1G, T1z); | |
v3.y = fma (KP923879532, T1O, T1L); | |
v11.y = fma (-KP923879532, T1O, T1L); | |
v11.x = fma (-KP923879532, T1G, T1z); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
*u8 = v8; | |
*u9 = v9; | |
*u10 = v10; | |
*u11 = v11; | |
*u12 = v12; | |
*u13 = v13; | |
*u14 = v14; | |
*u15 = v15; | |
} | |
kernel void radix | |
( | |
global const float2 * x, | |
global float2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 16; | |
x += i + batch_offset; | |
float2 v0 = x[0 * threads]; | |
float2 v1 = x[1 * threads]; | |
float2 v2 = x[2 * threads]; | |
float2 v3 = x[3 * threads]; | |
float2 v4 = x[4 * threads]; | |
float2 v5 = x[5 * threads]; | |
float2 v6 = x[6 * threads]; | |
float2 v7 = x[7 * threads]; | |
float2 v8 = x[8 * threads]; | |
float2 v9 = x[9 * threads]; | |
float2 v10 = x[10 * threads]; | |
float2 v11 = x[11 * threads]; | |
float2 v12 = x[12 * threads]; | |
float2 v13 = x[13 * threads]; | |
float2 v14 = x[14 * threads]; | |
float2 v15 = x[15 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((float)-0.3926990926265717 * k / p)); | |
v2 = mul(v2, twiddle((float)-0.7853981852531433 * k / p)); | |
v3 = mul(v3, twiddle((float)-1.178097248077393 * k / p)); | |
v4 = mul(v4, twiddle((float)-1.570796370506287 * k / p)); | |
v5 = mul(v5, twiddle((float)-1.963495492935181 * k / p)); | |
v6 = mul(v6, twiddle((float)-2.356194496154785 * k / p)); | |
v7 = mul(v7, twiddle((float)-2.748893737792969 * k / p)); | |
v8 = mul(v8, twiddle((float)-3.141592741012573 * k / p)); | |
v9 = mul(v9, twiddle((float)-3.534291744232178 * k / p)); | |
v10 = mul(v10, twiddle((float)-3.926990985870361 * k / p)); | |
v11 = mul(v11, twiddle((float)-4.319690227508545 * k / p)); | |
v12 = mul(v12, twiddle((float)-4.71238899230957 * k / p)); | |
v13 = mul(v13, twiddle((float)-5.105088233947754 * k / p)); | |
v14 = mul(v14, twiddle((float)-5.497787475585938 * k / p)); | |
v15 = mul(v15, twiddle((float)-5.890486240386963 * k / p)); | |
} | |
dft16(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15); | |
const size_t j = k + (i - k) * 16; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
y[11 * p] = v11; | |
y[12 * p] = v12; | |
y[13 * p] = v13; | |
y[14 * p] = v14; | |
y[15 * p] = v15; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float2 r2c | |
( | |
float v | |
) | |
{ | |
float2 r = {v, 0}; return r; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float2 * prm_1, | |
global float * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = r2c( prm_2[idx] ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float2 scl | |
( | |
float2 v, | |
float s | |
) | |
{ | |
v.x *= s; v.y *= s; return v; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float2 * prm_1, | |
global float2 * prm_2, | |
float prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = scl( prm_2[idx], prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float2 * prm_1, | |
global float2 * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float c2r | |
( | |
float2 v | |
) | |
{ | |
return v.x; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
float prm_2, | |
global float2 * prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = ( prm_2 * c2r( prm_3[idx] ) ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
float SUM_float | |
( | |
float prm1, | |
float prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global float * prm_1, | |
global float * prm_2, | |
float prm_3, | |
global float * g_odata | |
) | |
{ | |
float mySum = (float)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_float(mySum, pow( ( prm_1[idx] - prm_2[idx] ), prm_3 )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
FFT(C2C) size=3160 batch=1 | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (-KP951056516, Tk, Tj); | |
v2.y = fma (KP951056516, Tw, Tv); | |
v3.y = fma (-KP951056516, Tw, Tv); | |
v3.x = fma (KP951056516, Tk, Tj); | |
v4.x = fma (-KP951056516, Ti, Tb); | |
v4.y = fma (KP951056516, Tu, Tr); | |
v1.y = fma (-KP951056516, Tu, Tr); | |
v1.x = fma (KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_twiddle | |
( | |
ulong n, | |
global double2 * output | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t xx = ((ulong)x * x) % (2 * n); | |
if (x < n) output[x] = twiddle(-3.141592653589793 * xx / n); | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 conj | |
( | |
double2 v | |
) | |
{ | |
double2 r = {v.x, -v.y}; | |
return r; | |
} | |
kernel void bluestein_pad_kernel | |
( | |
global const double2 * input, | |
global double2 * output, | |
uint n, | |
uint m | |
) | |
{ | |
const uint x = get_global_id(0); | |
if (x < m) | |
{ | |
if(x < n || m - x < n) | |
{ | |
output[x] = conj(input[min(x, m - x)]); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[x] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (-KP951056516, Tk, Tj); | |
v2.y = fma (KP951056516, Tw, Tv); | |
v3.y = fma (-KP951056516, Tw, Tv); | |
v3.x = fma (KP951056516, Tk, Tj); | |
v4.x = fma (-KP951056516, Ti, Tb); | |
v4.y = fma (KP951056516, Tu, Tr); | |
v1.y = fma (-KP951056516, Tu, Tr); | |
v1.x = fma (KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_mul_in | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint radix, | |
uint p, | |
uint out_stride | |
) | |
{ | |
const size_t thread = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t batch = get_global_id(1); | |
const size_t element = get_global_id(2); | |
if(element < out_stride) | |
{ | |
const size_t in_off = thread + batch * radix * threads + element * threads; | |
const size_t out_off = thread * out_stride + batch * out_stride * threads + element; | |
if(element < radix) | |
{ | |
double2 w = exp[element]; | |
if(p != 1) | |
{ | |
ulong a = (ulong)element * (thread % p); | |
ulong b = (ulong)radix * p; | |
double2 t = twiddle(-6.283185307179586 * (a % (2 * b)) / b); | |
w = mul(w, t); | |
} | |
output[out_off] = mul(data[in_off], w); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[out_off] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (-KP951056516, Tk, Tj); | |
v2.y = fma (KP951056516, Tw, Tv); | |
v3.y = fma (-KP951056516, Tw, Tv); | |
v3.x = fma (KP951056516, Tk, Tj); | |
v4.x = fma (-KP951056516, Ti, Tb); | |
v4.y = fma (KP951056516, Tu, Tr); | |
v1.y = fma (-KP951056516, Tu, Tr); | |
v1.x = fma (KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
kernel void bluestein_mul | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint stride | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t y = get_global_id(1); | |
if(x < stride) | |
{ | |
const size_t off = x + stride * y; | |
output[off] = mul(data[off], exp[x]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, Ta, T5, Td; | |
{ | |
real_t T8, T1, T2, T9; | |
T8 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T9 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T8 + T9; | |
Ta = T8 - T9; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, T7, Te, Tg; | |
T6 = T4 + T5; | |
T7 = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta - T7; | |
v3.x = Tb + Te; | |
v1.x = Tb - Te; | |
v1.y = T7 + Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (KP951056516, Tk, Tj); | |
v2.y = fma (-KP951056516, Tw, Tv); | |
v3.y = fma (KP951056516, Tw, Tv); | |
v3.x = fma (-KP951056516, Tk, Tj); | |
v4.x = fma (KP951056516, Ti, Tb); | |
v4.y = fma (-KP951056516, Tu, Tr); | |
v1.y = fma (KP951056516, Tu, Tr); | |
v1.x = fma (-KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 scale | |
( | |
double2 x, | |
double a | |
) | |
{ | |
double2 r = {x.x * a, x.y * a}; | |
return r; | |
} | |
kernel void bluestein_mul_out | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
double div, | |
uint p, | |
uint in_stride, | |
uint radix | |
) | |
{ | |
const size_t i = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t b = get_global_id(1); | |
const size_t l = get_global_id(2); | |
if(l < radix) | |
{ | |
const size_t k = i % p; | |
const size_t j = k + (i - k) * radix; | |
const size_t in_off = i * in_stride + b * in_stride * threads + l; | |
const size_t out_off = j + b * threads * radix + l * p; | |
output[out_off] = mul(scale(data[in_off], div), exp[l]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (KP951056516, Tk, Tj); | |
v2.y = fma (-KP951056516, Tw, Tv); | |
v3.y = fma (KP951056516, Tw, Tv); | |
v3.x = fma (-KP951056516, Tk, Tj); | |
v4.x = fma (KP951056516, Ti, Tb); | |
v4.y = fma (-KP951056516, Tu, Tr); | |
v1.y = fma (KP951056516, Tu, Tr); | |
v1.x = fma (-KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_twiddle | |
( | |
ulong n, | |
global double2 * output | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t xx = ((ulong)x * x) % (2 * n); | |
if (x < n) output[x] = twiddle(3.141592653589793 * xx / n); | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 conj | |
( | |
double2 v | |
) | |
{ | |
double2 r = {v.x, -v.y}; | |
return r; | |
} | |
kernel void bluestein_pad_kernel | |
( | |
global const double2 * input, | |
global double2 * output, | |
uint n, | |
uint m | |
) | |
{ | |
const uint x = get_global_id(0); | |
if (x < m) | |
{ | |
if(x < n || m - x < n) | |
{ | |
output[x] = conj(input[min(x, m - x)]); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[x] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (-KP951056516, Tk, Tj); | |
v2.y = fma (KP951056516, Tw, Tv); | |
v3.y = fma (-KP951056516, Tw, Tv); | |
v3.x = fma (KP951056516, Tk, Tj); | |
v4.x = fma (-KP951056516, Ti, Tb); | |
v4.y = fma (KP951056516, Tu, Tr); | |
v1.y = fma (-KP951056516, Tu, Tr); | |
v1.x = fma (KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_mul_in | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint radix, | |
uint p, | |
uint out_stride | |
) | |
{ | |
const size_t thread = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t batch = get_global_id(1); | |
const size_t element = get_global_id(2); | |
if(element < out_stride) | |
{ | |
const size_t in_off = thread + batch * radix * threads + element * threads; | |
const size_t out_off = thread * out_stride + batch * out_stride * threads + element; | |
if(element < radix) | |
{ | |
double2 w = exp[element]; | |
if(p != 1) | |
{ | |
ulong a = (ulong)element * (thread % p); | |
ulong b = (ulong)radix * p; | |
double2 t = twiddle(6.283185307179586 * (a % (2 * b)) / b); | |
w = mul(w, t); | |
} | |
output[out_off] = mul(data[in_off], w); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[out_off] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (-KP951056516, Tk, Tj); | |
v2.y = fma (KP951056516, Tw, Tv); | |
v3.y = fma (-KP951056516, Tw, Tv); | |
v3.x = fma (KP951056516, Tk, Tj); | |
v4.x = fma (-KP951056516, Ti, Tb); | |
v4.y = fma (KP951056516, Tu, Tr); | |
v1.y = fma (-KP951056516, Tu, Tr); | |
v1.x = fma (KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
kernel void bluestein_mul | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint stride | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t y = get_global_id(1); | |
if(x < stride) | |
{ | |
const size_t off = x + stride * y; | |
output[off] = mul(data[off], exp[x]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, Ta, T5, Td; | |
{ | |
real_t T8, T1, T2, T9; | |
T8 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T9 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T8 + T9; | |
Ta = T8 - T9; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, T7, Te, Tg; | |
T6 = T4 + T5; | |
T7 = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta - T7; | |
v3.x = Tb + Te; | |
v1.x = Tb - Te; | |
v1.y = T7 + Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
DEVICE void | |
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
{ | |
const real_t KP951056516 = | |
+0.951056516295153572116439333379382143405698634; | |
const real_t KP559016994 = | |
+0.559016994374947424102293417182819058860154590; | |
const real_t KP250000000 = | |
+0.250000000000000000000000000000000000000000000; | |
const real_t KP618033988 = | |
+0.618033988749894848204586834365638117720309180; | |
{ | |
real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv; | |
{ | |
real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9; | |
Tl = v0.y; | |
T1 = v0.x; | |
{ | |
real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7; | |
Tc = v1.y; | |
T2 = v1.x; | |
T3 = v4.x; | |
Td = v4.y; | |
Tf = v2.y; | |
T5 = v2.x; | |
T6 = v3.x; | |
Tg = v3.y; | |
Ts = T2 - T3; | |
T4 = T2 + T3; | |
Tt = T5 - T6; | |
T7 = T5 + T6; | |
T8 = T4 + T7; | |
Ta = T4 - T7; | |
Te = Tc - Td; | |
Tm = Tc + Td; | |
Tn = Tf + Tg; | |
Th = Tf - Tg; | |
} | |
To = Tm + Tn; | |
Tq = Tm - Tn; | |
Ti = fma (KP618033988, Th, Te); | |
Tk = fma (-KP618033988, Te, Th); | |
v0.y = Tl + To; | |
v0.x = T1 + T8; | |
T9 = fma (-KP250000000, T8, T1); | |
Tu = fma (KP618033988, Tt, Ts); | |
Tw = fma (-KP618033988, Ts, Tt); | |
Tp = fma (-KP250000000, To, Tl); | |
Tb = fma (KP559016994, Ta, T9); | |
Tj = fma (-KP559016994, Ta, T9); | |
} | |
Tr = fma (KP559016994, Tq, Tp); | |
Tv = fma (-KP559016994, Tq, Tp); | |
v2.x = fma (KP951056516, Tk, Tj); | |
v2.y = fma (-KP951056516, Tw, Tv); | |
v3.y = fma (KP951056516, Tw, Tv); | |
v3.x = fma (-KP951056516, Tk, Tj); | |
v4.x = fma (KP951056516, Ti, Tb); | |
v4.y = fma (-KP951056516, Tu, Tr); | |
v1.y = fma (KP951056516, Tu, Tr); | |
v1.x = fma (-KP951056516, Ti, Tb); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 5; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p)); | |
v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p)); | |
v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p)); | |
v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p)); | |
} | |
dft5(&v0, &v1, &v2, &v3, &v4); | |
const size_t j = k + (i - k) * 5; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 scale | |
( | |
double2 x, | |
double a | |
) | |
{ | |
double2 r = {x.x * a, x.y * a}; | |
return r; | |
} | |
kernel void bluestein_mul_out | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
double div, | |
uint p, | |
uint in_stride, | |
uint radix | |
) | |
{ | |
const size_t i = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t b = get_global_id(1); | |
const size_t l = get_global_id(2); | |
if(l < radix) | |
{ | |
const size_t k = i % p; | |
const size_t j = k + (i - k) * radix; | |
const size_t in_off = i * in_stride + b * in_stride * threads + l; | |
const size_t out_off = j + b * threads * radix + l * p; | |
output[out_off] = mul(scale(data[in_off], div), exp[l]); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global double2 * prm_2 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = prm_2[idx]; | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double2 scl | |
( | |
double2 v, | |
double s | |
) | |
{ | |
v.x *= s; v.y *= s; return v; | |
} | |
kernel void vexcl_vector_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global double2 * prm_2, | |
double prm_3 | |
) | |
{ | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
prm_1[idx] = scl( prm_2[idx], prm_3 ); | |
} | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
double dot2 | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
return a.x * b.x + a.y * b.y; | |
} | |
double2 minus | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x - b.x, a.y - b.y}; return r; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global double2 * prm_2, | |
global double2 * prm_3, | |
global double2 * prm_4, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, dot2( minus( prm_1[idx], prm_2[idx] ), minus( prm_3[idx], prm_4[idx] ) )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
double SUM_double | |
( | |
double prm1, | |
double prm2 | |
) | |
{ | |
return prm1 + prm2; | |
} | |
double dot2 | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
return a.x * b.x + a.y * b.y; | |
} | |
kernel void vexcl_reductor_kernel | |
( | |
ulong n, | |
global double2 * prm_1, | |
global double2 * prm_2, | |
global double * g_odata | |
) | |
{ | |
double mySum = (double)0; | |
ulong chunk_size = (n + get_global_size(0) - 1) / get_global_size(0); | |
ulong chunk_start = get_global_id(0) * chunk_size; | |
ulong chunk_end = chunk_start + chunk_size; | |
if (n < chunk_end) chunk_end = n; | |
for(ulong idx = chunk_start; idx < chunk_end; ++idx) | |
{ | |
mySum = SUM_double(mySum, dot2( prm_1[idx], prm_2[idx] )); | |
} | |
g_odata[get_group_id(0)] = mySum; | |
} | |
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){1.4190491545037371} exceeds 1e-08 | |
FFT(C2C) size=47 batch=39 | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_twiddle | |
( | |
ulong n, | |
global double2 * output | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t xx = ((ulong)x * x) % (2 * n); | |
if (x < n) output[x] = twiddle(-3.141592653589793 * xx / n); | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 conj | |
( | |
double2 v | |
) | |
{ | |
double2 r = {v.x, -v.y}; | |
return r; | |
} | |
kernel void bluestein_pad_kernel | |
( | |
global const double2 * input, | |
global double2 * output, | |
uint n, | |
uint m | |
) | |
{ | |
const uint x = get_global_id(0); | |
if (x < m) | |
{ | |
if(x < n || m - x < n) | |
{ | |
output[x] = conj(input[min(x, m - x)]); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[x] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_mul_in | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint radix, | |
uint p, | |
uint out_stride | |
) | |
{ | |
const size_t thread = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t batch = get_global_id(1); | |
const size_t element = get_global_id(2); | |
if(element < out_stride) | |
{ | |
const size_t in_off = thread + batch * radix * threads + element * threads; | |
const size_t out_off = thread * out_stride + batch * out_stride * threads + element; | |
if(element < radix) | |
{ | |
double2 w = exp[element]; | |
if(p != 1) | |
{ | |
ulong a = (ulong)element * (thread % p); | |
ulong b = (ulong)radix * p; | |
double2 t = twiddle(-6.283185307179586 * (a % (2 * b)) / b); | |
w = mul(w, t); | |
} | |
output[out_off] = mul(data[in_off], w); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[out_off] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
kernel void bluestein_mul | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint stride | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t y = get_global_id(1); | |
if(x < stride) | |
{ | |
const size_t off = x + stride * y; | |
output[off] = mul(data[off], exp[x]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, Ta, T5, Td; | |
{ | |
real_t T8, T1, T2, T9; | |
T8 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T9 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T8 + T9; | |
Ta = T8 - T9; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, T7, Te, Tg; | |
T6 = T4 + T5; | |
T7 = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta - T7; | |
v3.x = Tb + Te; | |
v1.x = Tb - Te; | |
v1.y = T7 + Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T5, T1, T6, T2, T3, T7; | |
T5 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Ta, T8, Tc, Tb, T9; | |
T4 = T2 + T3; | |
Ta = T2 - T3; | |
T8 = T6 + T7; | |
Tc = T7 - T6; | |
Tb = fma (-KP500000000, T4, T1); | |
T9 = fma (-KP500000000, T8, T5); | |
v0.y = T5 + T8; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, Tc, Tb); | |
v2.y = fma (-KP866025403, Ta, T9); | |
v1.y = fma (KP866025403, Ta, T9); | |
v1.x = fma (KP866025403, Tc, Tb); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 scale | |
( | |
double2 x, | |
double a | |
) | |
{ | |
double2 r = {x.x * a, x.y * a}; | |
return r; | |
} | |
kernel void bluestein_mul_out | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
double div, | |
uint p, | |
uint in_stride, | |
uint radix | |
) | |
{ | |
const size_t i = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t b = get_global_id(1); | |
const size_t l = get_global_id(2); | |
if(l < radix) | |
{ | |
const size_t k = i % p; | |
const size_t j = k + (i - k) * radix; | |
const size_t in_off = i * in_stride + b * in_stride * threads + l; | |
const size_t out_off = j + b * threads * radix + l * p; | |
output[out_off] = mul(scale(data[in_off], div), exp[l]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_twiddle | |
( | |
ulong n, | |
global double2 * output | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t xx = ((ulong)x * x) % (2 * n); | |
if (x < n) output[x] = twiddle(3.141592653589793 * xx / n); | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 conj | |
( | |
double2 v | |
) | |
{ | |
double2 r = {v.x, -v.y}; | |
return r; | |
} | |
kernel void bluestein_pad_kernel | |
( | |
global const double2 * input, | |
global double2 * output, | |
uint n, | |
uint m | |
) | |
{ | |
const uint x = get_global_id(0); | |
if (x < m) | |
{ | |
if(x < n || m - x < n) | |
{ | |
output[x] = conj(input[min(x, m - x)]); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[x] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
} | |
kernel void bluestein_mul_in | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint radix, | |
uint p, | |
uint out_stride | |
) | |
{ | |
const size_t thread = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t batch = get_global_id(1); | |
const size_t element = get_global_id(2); | |
if(element < out_stride) | |
{ | |
const size_t in_off = thread + batch * radix * threads + element * threads; | |
const size_t out_off = thread * out_stride + batch * out_stride * threads + element; | |
if(element < radix) | |
{ | |
double2 w = exp[element]; | |
if(p != 1) | |
{ | |
ulong a = (ulong)element * (thread % p); | |
ulong b = (ulong)radix * p; | |
double2 t = twiddle(6.283185307179586 * (a % (2 * b)) / b); | |
w = mul(w, t); | |
} | |
output[out_off] = mul(data[in_off], w); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[out_off] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
Tn = T1 - T2; | |
T3 = T1 + T2; | |
TC = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
TB = T4 - T5; | |
T6 = T4 + T5; | |
To = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv - Ty; | |
TH = Tv + Ty; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TL, TG, Tu, Tf, Tm, TO; | |
{ | |
real_t T7, Te, TP, TQ; | |
TL = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tt - Tq; | |
Tu = Tq + Tt; | |
Te = Ta + Td; | |
Tf = Td - Ta; | |
Tm = Ti - Tl; | |
TP = Ti + Tl; | |
TQ = TM + TN; | |
TO = TM - TN; | |
v0.x = T7 + Te; | |
v0.y = TP + TQ; | |
v4.y = TP - TQ; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = Tn - To; | |
Tp = Tn + To; | |
v2.y = Tf + Tm; | |
v2.x = TL + TO; | |
v6.x = TL - TO; | |
v6.y = Tm - Tf; | |
TA = Tu + Tz; | |
TE = Tz - Tu; | |
TD = TB + TC; | |
TJ = TC - TB; | |
TK = TG + TH; | |
TI = TG - TH; | |
v1.x = fma (KP707106781, TA, Tp); | |
v1.y = fma (KP707106781, TK, TJ); | |
v5.y = fma (-KP707106781, TK, TJ); | |
v5.x = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v3.y = fma (KP707106781, TE, TD); | |
v3.x = fma (KP707106781, TI, TF); | |
v7.x = fma (-KP707106781, TI, TF); | |
v7.y = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
kernel void bluestein_mul | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
uint stride | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t y = get_global_id(1); | |
if(x < stride) | |
{ | |
const size_t off = x + stride * y; | |
output[off] = mul(data[off], exp[x]); | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, Ta, T5, Td; | |
{ | |
real_t T8, T1, T2, T9; | |
T8 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T9 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T8 + T9; | |
Ta = T8 - T9; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, T7, Te, Tg; | |
T6 = T4 + T5; | |
T7 = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta - T7; | |
v3.x = Tb + Te; | |
v1.x = Tb - Te; | |
v1.y = T7 + Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 8 -name dft8 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 52 FP additions, 8 FP multiplications, | |
* (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
* 43 stack variables, 1 constants, and 32 memory accesses | |
*/ | |
DEVICE void | |
dft8 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4, | |
real2_t * u5, real2_t * u6, real2_t * u7) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
real2_t v4 = *u4; | |
real2_t v5 = *u5; | |
real2_t v6 = *u6; | |
real2_t v7 = *u7; | |
{ | |
const real_t KP707106781 = | |
+0.707106781186547524400844362104849039284835938; | |
{ | |
real_t TF, TE, TD, TI; | |
{ | |
real_t TB, T3, Tn, Ti, To, T6, TC, Tl, Td, TN, Tz, TH, Ta, Tq, | |
Tt; | |
real_t TM; | |
{ | |
real_t Tj, T4, T5, Tk; | |
{ | |
real_t Tg, T1, T2, Th; | |
Tg = v0.y; | |
T1 = v0.x; | |
T2 = v4.x; | |
Th = v4.y; | |
Tj = v2.y; | |
T4 = v2.x; | |
TB = T1 - T2; | |
T3 = T1 + T2; | |
Tn = Tg - Th; | |
Ti = Tg + Th; | |
T5 = v6.x; | |
Tk = v6.y; | |
} | |
{ | |
real_t Tw, Tb, Tc, Tx; | |
Tw = v7.y; | |
Tb = v7.x; | |
To = T4 - T5; | |
T6 = T4 + T5; | |
TC = Tj - Tk; | |
Tl = Tj + Tk; | |
Tc = v3.x; | |
Tx = v3.y; | |
{ | |
real_t Tr, T8, Tv, Ty, T9, Ts; | |
Tr = v1.y; | |
T8 = v1.x; | |
Td = Tb + Tc; | |
Tv = Tb - Tc; | |
TN = Tw + Tx; | |
Ty = Tw - Tx; | |
T9 = v5.x; | |
Ts = v5.y; | |
Tz = Tv + Ty; | |
TH = Ty - Tv; | |
Ta = T8 + T9; | |
Tq = T8 - T9; | |
Tt = Tr - Ts; | |
TM = Tr + Ts; | |
} | |
} | |
} | |
{ | |
real_t TP, TG, Tu, Tf, Tm, TQ; | |
{ | |
real_t T7, Te, TL, TO; | |
TP = T3 - T6; | |
T7 = T3 + T6; | |
TG = Tq + Tt; | |
Tu = Tq - Tt; | |
Te = Ta + Td; | |
Tf = Ta - Td; | |
Tm = Ti - Tl; | |
TL = Ti + Tl; | |
TO = TM + TN; | |
TQ = TN - TM; | |
v0.x = T7 + Te; | |
v0.y = TL + TO; | |
v4.y = TL - TO; | |
v4.x = T7 - Te; | |
} | |
{ | |
real_t Tp, TA, TJ, TK; | |
TF = To + Tn; | |
Tp = Tn - To; | |
v2.y = Tf + Tm; | |
v2.x = TP + TQ; | |
v6.x = TP - TQ; | |
v6.y = Tm - Tf; | |
TA = Tu - Tz; | |
TE = Tu + Tz; | |
TD = TB - TC; | |
TJ = TB + TC; | |
TK = TH - TG; | |
TI = TG + TH; | |
v3.y = fma (KP707106781, TA, Tp); | |
v3.x = fma (KP707106781, TK, TJ); | |
v7.x = fma (-KP707106781, TK, TJ); | |
v7.y = fma (-KP707106781, TA, Tp); | |
} | |
} | |
} | |
v1.x = fma (KP707106781, TE, TD); | |
v1.y = fma (KP707106781, TI, TF); | |
v5.y = fma (-KP707106781, TI, TF); | |
v5.x = fma (-KP707106781, TE, TD); | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
*u4 = v4; | |
*u5 = v5; | |
*u6 = v6; | |
*u7 = v7; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 8; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.7853981633974483 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.570796326794897 * k / p)); | |
v3 = mul(v3, twiddle((double)-2.356194490192345 * k / p)); | |
v4 = mul(v4, twiddle((double)-3.141592653589793 * k / p)); | |
v5 = mul(v5, twiddle((double)-3.926990816987241 * k / p)); | |
v6 = mul(v6, twiddle((double)-4.71238898038469 * k / p)); | |
v7 = mul(v7, twiddle((double)-5.497787143782138 * k / p)); | |
} | |
dft8(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7); | |
const size_t j = k + (i - k) * 8; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T5, T1, T6, T2, T3, T7; | |
T5 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Ta, T8, Tc, Tb, T9; | |
T4 = T2 + T3; | |
Ta = T2 - T3; | |
T8 = T6 + T7; | |
Tc = T7 - T6; | |
Tb = fma (-KP500000000, T4, T1); | |
T9 = fma (-KP500000000, T8, T5); | |
v0.y = T5 + T8; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, Tc, Tb); | |
v2.y = fma (-KP866025403, Ta, T9); | |
v1.y = fma (KP866025403, Ta, T9); | |
v1.x = fma (KP866025403, Tc, Tb); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 scale | |
( | |
double2 x, | |
double a | |
) | |
{ | |
double2 r = {x.x * a, x.y * a}; | |
return r; | |
} | |
kernel void bluestein_mul_out | |
( | |
global const double2 * data, | |
global const double2 * exp, | |
global double2 * output, | |
double div, | |
uint p, | |
uint in_stride, | |
uint radix | |
) | |
{ | |
const size_t i = get_global_id(0); | |
const size_t threads = get_global_size(0); | |
const size_t b = get_global_id(1); | |
const size_t l = get_global_id(2); | |
if(l < radix) | |
{ | |
const size_t k = i % p; | |
const size_t j = k + (i - k) * radix; | |
const size_t in_off = i * in_stride + b * in_stride * threads + l; | |
const size_t out_off = j + b * threads * radix + l * p; | |
output[out_off] = mul(scale(data[in_off], div), exp[l]); | |
} | |
} | |
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){1.5464188795074079} exceeds 1e-08 | |
FFT(C2C) size=858 batch=2 | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 4 FP additions, 0 FP multiplications, | |
* (or, 4 additions, 0 multiplications, 0 fused multiply/add), | |
* 6 stack variables, 0 constants, and 8 memory accesses | |
*/ | |
DEVICE void | |
dft2 (real2_t * u0, real2_t * u1) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
{ | |
{ | |
real_t T3, T1, T2, T4; | |
T3 = v0.y; | |
T1 = v0.x; | |
T2 = v1.x; | |
T4 = v1.y; | |
v0.x = T1 + T2; | |
v0.y = T3 + T4; | |
v1.y = T3 - T4; | |
v1.x = T1 - T2; | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 2; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-3.141592653589793 * k / p)); | |
} | |
dft2(&v0, &v1); | |
const size_t j = k + (i - k) * 2; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
/*
 * Twiddle factor e^{i*alpha} = (cos(alpha), sin(alpha)) as a double2.
 * OpenCL sincos() returns sin(alpha) and stores cos(alpha) through the
 * pointer argument, so cs is written before sn's initializer completes.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 140 FP additions, 110 FP multiplications, | |
* (or, 30 additions, 0 multiplications, 110 fused multiply/add), | |
* 96 stack variables, 10 constants, and 44 memory accesses | |
*/ | |
/*
 * Forward (sign = -1) 11-point complex DFT butterfly on double2 values.
 * Machine-generated FFTW/genfft FMA code (see the "Generated by" banner
 * above). The statement order and fma() operand order were chosen by the
 * generator for pipeline scheduling; do not hand-reorder, as that would
 * change the floating-point rounding of the results.
 */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
       real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
       real2_t * u10)
{
  /* Load all eleven points into registers. */
  real2_t v0 = *u0;
  real2_t v1 = *u1;
  real2_t v2 = *u2;
  real2_t v3 = *u3;
  real2_t v4 = *u4;
  real2_t v5 = *u5;
  real2_t v6 = *u6;
  real2_t v7 = *u7;
  real2_t v8 = *u8;
  real2_t v9 = *u9;
  real2_t v10 = *u10;
  {
    /* Generator-emitted trigonometric constants for N = 11. */
    const real_t KP989821441 =
      +0.989821441880932732376092037776718787376519372;
    const real_t KP959492973 =
      +0.959492973614497389890368057066327699062454848;
    const real_t KP918985947 =
      +0.918985947228994779780736114132655398124909697;
    const real_t KP876768831 =
      +0.876768831002589333891339807079336796764054852;
    const real_t KP830830026 =
      +0.830830026003772851058548298459246407048009821;
    const real_t KP778434453 =
      +0.778434453334651800608337670740821884709317477;
    const real_t KP715370323 =
      +0.715370323453429719112414662767260662417897278;
    const real_t KP634356270 =
      +0.634356270682424498893150776899916060542806975;
    const real_t KP342584725 =
      +0.342584725681637509502641509861112333758894680;
    const real_t KP521108558 =
      +0.521108558113202722944698153526659300680427422;
    {
      real_t T1, TA, T1p, T1y, T19, T1d, T1a, T1e;
      {
        real_t T1f, T1u, T4, T1q, Tg, T1t, T7, T1s, Ta, Td, T1r, TP,
          T1X, T26, Ti;
        real_t TG, T1O, T1w, TY, T1F, T17, To, T1i, T1k, T1h, Tr, T1j,
          Tu, T1g, Tx;
        real_t T21, TU, TL, TC, T1S, T1J, T1m, T12, T1z, T1b;
        T1f = v0.y;
        T1 = v0.x;
        {
          real_t Tv, Tw, Ty, Tz, Tp, Tq, Tm, Tn, Ts, Tt, T1E, T16,
            Tb, Tc;
          {
            real_t T2, T3, Te, Tf;
            Tv = v1.y;
            T2 = v1.x;
            T3 = v10.x;
            Tw = v10.y;
            Ty = v5.y;
            Te = v5.x;
            Tf = v6.x;
            Tz = v6.y;
            {
              real_t T5, T6, T8, T9;
              Tp = v2.y;
              T5 = v2.x;
              T1u = T3 - T2;
              T4 = T2 + T3;
              T1q = Tf - Te;
              Tg = Te + Tf;
              T6 = v9.x;
              Tq = v9.y;
              Tm = v3.y;
              T8 = v3.x;
              T9 = v8.x;
              Tn = v8.y;
              Ts = v4.y;
              Tb = v4.x;
              T1t = T6 - T5;
              T7 = T5 + T6;
              T1s = T9 - T8;
              Ta = T8 + T9;
              Tc = v7.x;
              Tt = v7.y;
            }
          }
          {
            real_t T25, Th, T1W, TO;
            T25 = fma (KP521108558, T1q, T1u);
            T1W = fma (KP521108558, T1s, T1q);
            TO = fma (-KP342584725, T4, Ta);
            Th = fma (-KP342584725, Ta, T7);
            Td = Tb + Tc;
            T1r = Tc - Tb;
            TP = fma (-KP634356270, TO, Tg);
            T1X = fma (-KP715370323, T1W, T1t);
            T26 = fma (KP715370323, T25, T1r);
            {
              real_t TF, T1N, T1v, TX;
              TF = fma (-KP342584725, Td, T4);
              Ti = fma (-KP634356270, Th, Td);
              T1N = fma (-KP521108558, T1t, T1r);
              T1v = fma (-KP521108558, T1u, T1t);
              TG = fma (-KP634356270, TF, T7);
              TX = fma (-KP342584725, T7, Tg);
              T1O = fma (KP715370323, T1N, T1q);
              T1w = fma (-KP715370323, T1v, T1s);
              T1E = fma (KP521108558, T1r, T1s);
              TY = fma (-KP634356270, TX, T4);
              T16 = fma (-KP342584725, Tg, Td);
            }
          }
          T1F = fma (KP715370323, T1E, T1u);
          T17 = fma (-KP634356270, T16, Ta);
          To = Tm - Tn;
          T1i = Tm + Tn;
          T1k = Ty + Tz;
          TA = Ty - Tz;
          T1h = Tp + Tq;
          Tr = Tp - Tq;
          T1j = Ts + Tt;
          Tu = Ts - Tt;
          {
            real_t TB, T1R, T20, TK, TT, T1I, T1l;
            T20 = fma (-KP342584725, T1i, T1h);
            TK = fma (KP521108558, To, TA);
            TT = fma (-KP521108558, Tr, Tu);
            T1g = Tv + Tw;
            Tx = Tv - Tw;
            T21 = fma (-KP634356270, T20, T1j);
            TU = fma (KP715370323, TT, TA);
            TL = fma (-KP715370323, TK, Tr);
            TB = fma (KP521108558, TA, Tx);
            T1R = fma (-KP342584725, T1j, T1g);
            T1I = fma (-KP342584725, T1g, T1i);
            T1l = fma (-KP342584725, T1k, T1j);
            TC = fma (KP715370323, TB, Tu);
            T1S = fma (-KP634356270, T1R, T1h);
            T1J = fma (-KP634356270, T1I, T1k);
            T1m = fma (-KP634356270, T1l, T1i);
            T12 = fma (KP521108558, Tu, To);
            T1z = fma (-KP342584725, T1h, T1k);
            T1b = fma (-KP521108558, Tx, Tr);
          }
        }
        {
          real_t T13, T1A, T1c, T1Z, T1V, TH, TM, Tj, TD;
          T13 = fma (KP715370323, T12, Tx);
          T1A = fma (-KP634356270, T1z, T1g);
          T1c = fma (-KP715370323, T1b, To);
          /* DC output: plain sums of all inputs. */
          v0.y = T1f + T1g + T1h + T1i + T1j + T1k;
          v0.x = T1 + T4 + T7 + Ta + Td + Tg;
          Tj = fma (-KP778434453, Ti, T4);
          TD = fma (KP830830026, TC, Tr);
          {
            real_t TE, T23, T28, Tl, Tk, T22, T27;
            T22 = fma (-KP778434453, T21, T1g);
            T27 = fma (KP830830026, T26, T1t);
            Tk = fma (-KP876768831, Tj, Tg);
            TE = fma (KP918985947, TD, To);
            T23 = fma (-KP876768831, T22, T1k);
            T28 = fma (KP918985947, T27, T1s);
            Tl = fma (-KP959492973, Tk, T1);
            {
              real_t T1U, T1T, T24, T1Y;
              T1T = fma (-KP778434453, T1S, T1k);
              T24 = fma (-KP959492973, T23, T1f);
              T1Y = fma (KP830830026, T1X, T1u);
              T1U = fma (-KP876768831, T1T, T1i);
              /* Conjugate-symmetric output pairs (1,10), (2,9), ... share
                 the same partial sums with opposite fma signs. */
              v10.y = fma (-KP989821441, T28, T24);
              v10.x = fma (-KP989821441, TE, Tl);
              v1.x = fma (KP989821441, TE, Tl);
              v1.y = fma (KP989821441, T28, T24);
              T1Z = fma (-KP918985947, T1Y, T1r);
              T1V = fma (-KP959492973, T1U, T1f);
            }
            TH = fma (-KP778434453, TG, Tg);
            TM = fma (KP830830026, TL, Tx);
          }
          {
            real_t TS, TW, T1M, TZ, T14, T1Q;
            {
              real_t TN, TR, TV, TJ, TI, TQ, T1P, T1L, T1K;
              TQ = fma (-KP778434453, TP, Td);
              TI = fma (-KP876768831, TH, Ta);
              TN = fma (-KP918985947, TM, Tu);
              TR = fma (-KP876768831, TQ, T7);
              TV = fma (-KP830830026, TU, To);
              TJ = fma (-KP959492973, TI, T1);
              T1K = fma (-KP778434453, T1J, T1j);
              TS = fma (-KP959492973, TR, T1);
              TW = fma (-KP918985947, TV, Tx);
              v9.y = fma (KP989821441, T1Z, T1V);
              v9.x = fma (KP989821441, TN, TJ);
              v2.x = fma (-KP989821441, TN, TJ);
              v2.y = fma (-KP989821441, T1Z, T1V);
              T1L = fma (-KP876768831, T1K, T1h);
              T1P = fma (-KP830830026, T1O, T1s);
              T1M = fma (-KP959492973, T1L, T1f);
              TZ = fma (-KP778434453, TY, Ta);
              T14 = fma (-KP830830026, T13, TA);
              T1Q = fma (-KP918985947, T1P, T1u);
            }
            {
              real_t T15, T11, T1C, T1G, T1B, T10;
              T1B = fma (-KP778434453, T1A, T1i);
              T10 = fma (-KP876768831, TZ, Td);
              T15 = fma (KP918985947, T14, Tr);
              v8.y = fma (-KP989821441, T1Q, T1M);
              v8.x = fma (-KP989821441, TW, TS);
              v3.x = fma (KP989821441, TW, TS);
              v3.y = fma (KP989821441, T1Q, T1M);
              T11 = fma (-KP959492973, T10, T1);
              T1C = fma (-KP876768831, T1B, T1j);
              T1G = fma (-KP830830026, T1F, T1q);
              {
                real_t T1D, T1H, T1o, T1x, T1n, T18;
                T1n = fma (-KP778434453, T1m, T1h);
                T1D = fma (-KP959492973, T1C, T1f);
                T1H = fma (KP918985947, T1G, T1t);
                T1o = fma (-KP876768831, T1n, T1g);
                T1x = fma (-KP830830026, T1w, T1r);
                T18 = fma (-KP778434453, T17, T7);
                v7.x = fma (KP989821441, T15, T11);
                v7.y = fma (KP989821441, T1H, T1D);
                v4.y = fma (-KP989821441, T1H, T1D);
                v4.x = fma (-KP989821441, T15, T11);
                T1p = fma (-KP959492973, T1o, T1f);
                T1y = fma (-KP918985947, T1x, T1q);
                T19 = fma (-KP876768831, T18, T4);
                T1d = fma (-KP830830026, T1c, Tu);
              }
            }
          }
        }
      }
      T1a = fma (-KP959492973, T19, T1);
      T1e = fma (-KP918985947, T1d, TA);
      v5.y = fma (KP989821441, T1y, T1p);
      v5.x = fma (KP989821441, T1e, T1a);
      v6.x = fma (-KP989821441, T1e, T1a);
      v6.y = fma (-KP989821441, T1y, T1p);
    }
  }
  /* Write all eleven results back through the caller's pointers. */
  *u0 = v0;
  *u1 = v1;
  *u2 = v2;
  *u3 = v3;
  *u4 = v4;
  *u5 = v5;
  *u6 = v6;
  *u7 = v7;
  *u8 = v8;
  *u9 = v9;
  *u10 = v10;
}
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 11; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
double2 v8 = x[8 * threads]; | |
double2 v9 = x[9 * threads]; | |
double2 v10 = x[10 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.5711986642890533 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.142397328578107 * k / p)); | |
v3 = mul(v3, twiddle((double)-1.71359599286716 * k / p)); | |
v4 = mul(v4, twiddle((double)-2.284794657156213 * k / p)); | |
v5 = mul(v5, twiddle((double)-2.855993321445267 * k / p)); | |
v6 = mul(v6, twiddle((double)-3.42719198573432 * k / p)); | |
v7 = mul(v7, twiddle((double)-3.998390650023373 * k / p)); | |
v8 = mul(v8, twiddle((double)-4.569589314312426 * k / p)); | |
v9 = mul(v9, twiddle((double)-5.140787978601479 * k / p)); | |
v10 = mul(v10, twiddle((double)-5.711986642890533 * k / p)); | |
} | |
dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10); | |
const size_t j = k + (i - k) * 11; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product (a.x + i*a.y) * (b.x + i*b.y), packed as double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    double2 r = {re, im};
    return r;
}
/*
 * Twiddle factor e^{i*alpha} = (cos(alpha), sin(alpha)) as a double2.
 * OpenCL sincos() returns sin(alpha) and stores cos(alpha) through the
 * pointer argument, so cs is written before sn's initializer completes.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 176 FP additions, 114 FP multiplications, | |
* (or, 62 additions, 0 multiplications, 114 fused multiply/add), | |
* 114 stack variables, 25 constants, and 52 memory accesses | |
*/ | |
/*
 * Forward (sign = -1) 13-point complex DFT butterfly on double2 values.
 * Machine-generated FFTW/genfft FMA code (see the "Generated by" banner
 * above). The statement order and fma() operand order were chosen by the
 * generator for pipeline scheduling; do not hand-reorder, as that would
 * change the floating-point rounding of the results.
 */
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
       real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
       real2_t * u10, real2_t * u11, real2_t * u12)
{
  /* Load all thirteen points into registers. */
  real2_t v0 = *u0;
  real2_t v1 = *u1;
  real2_t v2 = *u2;
  real2_t v3 = *u3;
  real2_t v4 = *u4;
  real2_t v5 = *u5;
  real2_t v6 = *u6;
  real2_t v7 = *u7;
  real2_t v8 = *u8;
  real2_t v9 = *u9;
  real2_t v10 = *u10;
  real2_t v11 = *u11;
  real2_t v12 = *u12;
  {
    /* Generator-emitted trigonometric constants for N = 13. */
    const real_t KP600477271 =
      +0.600477271932665282925769253334763009352012849;
    const real_t KP875502302 =
      +0.875502302409147941146295545768755143177842006;
    const real_t KP520028571 =
      +0.520028571888864619117130500499232802493238139;
    const real_t KP575140729 =
      +0.575140729474003121368385547455453388461001608;
    const real_t KP300462606 =
      +0.300462606288665774426601772289207995520941381;
    const real_t KP516520780 =
      +0.516520780623489722840901288569017135705033622;
    const real_t KP968287244 =
      +0.968287244361984016049539446938120421179794516;
    const real_t KP503537032 =
      +0.503537032863766627246873853868466977093348562;
    const real_t KP251768516 =
      +0.251768516431883313623436926934233488546674281;
    const real_t KP581704778 =
      +0.581704778510515730456870384989698884939833902;
    const real_t KP859542535 =
      +0.859542535098774820163672132761689612766401925;
    const real_t KP083333333 =
      +0.083333333333333333333333333333333333333333333;
    const real_t KP957805992 =
      +0.957805992594665126462521754605754580515587217;
    const real_t KP522026385 =
      +0.522026385161275033714027226654165028300441940;
    const real_t KP853480001 =
      +0.853480001859823990758994934970528322872359049;
    const real_t KP769338817 =
      +0.769338817572980603471413688209101117038278899;
    const real_t KP612264650 =
      +0.612264650376756543746494474777125408779395514;
    const real_t KP038632954 =
      +0.038632954644348171955506895830342264440241080;
    const real_t KP302775637 =
      +0.302775637731994646559610633735247973125648287;
    const real_t KP514918778 =
      +0.514918778086315755491789696138117261566051239;
    const real_t KP686558370 =
      +0.686558370781754340655719594850823015421401653;
    const real_t KP226109445 =
      +0.226109445035782405468510155372505010481906348;
    const real_t KP301479260 =
      +0.301479260047709873958013540496673347309208464;
    const real_t KP866025403 =
      +0.866025403784438646763723170752936183471402627;
    const real_t KP500000000 =
      +0.500000000000000000000000000000000000000000000;
    {
      real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
      {
        real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw,
          T2j, T2c, T1m;
        real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
        T1P = v0.y;
        T1 = v0.x;
        {
          real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR,
            T2b, Tv, Ts;
          real_t T2a;
          {
            real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu,
              Tl;
            {
              real_t T7, T8, T9, Td, Te;
              TK = v8.y;
              Td = v8.x;
              Te = v5.x;
              TL = v5.y;
              T16 = v12.y;
              T7 = v12.x;
              T8 = v10.x;
              TY = v10.y;
              TZ = v4.y;
              T9 = v4.x;
              T2d = Td - Te;
              Tf = Td + Te;
              {
                real_t T2, Ta, T3, T4;
                T2 = v1.x;
                T13 = v1.y;
                Ta = T8 + T9;
                Tq = T8 - T9;
                TW = v3.y;
                T3 = v3.x;
                T4 = v9.x;
                TV = v9.y;
                {
                  real_t Tg, T5, Th, Tj, Tk;
                  TN = v11.y;
                  Tg = v11.x;
                  Ty = fma (KP500000000, Ta, -(T7));
                  Tb = T7 + Ta;
                  Tr = T4 - T3;
                  T5 = T3 + T4;
                  Th = v6.x;
                  TO = v6.y;
                  TQ = v7.y;
                  Tj = v7.x;
                  Tk = v2.x;
                  TR = v2.y;
                  T6 = T2 + T5;
                  Tx = fma (-KP500000000, T5, T2);
                  Ti = Tg + Th;
                  Tt = Tg - Th;
                  Tu = Tj - Tk;
                  Tl = Tj + Tk;
                }
              }
            }
            {
              real_t Tc, Tm, T2e, T2g;
              Tc = T6 + Tb;
              T2n = T6 - Tb;
              T2b = Ti - Tl;
              Tm = Ti + Tl;
              T2e = Tt + Tu;
              Tv = Tt - Tu;
              Ts = Tq - Tr;
              T2g = Tr + Tq;
              {
                real_t Tz, TA, Tn, T2f;
                Tz = Tx - Ty;
                T2a = Tx + Ty;
                TA = fma (-KP500000000, Tm, Tf);
                Tn = Tf + Tm;
                T2f = fma (-KP500000000, T2e, T2d);
                T2o = T2d + T2e;
                To = Tc + Tn;
                TH = Tc - Tn;
                T2h = fma (KP866025403, T2g, T2f);
                T2k = fma (-KP866025403, T2g, T2f);
                TE = Tz - TA;
                TB = Tz + TA;
              }
            }
          }
          {
            real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a,
              T1b, TS, T17, T14;
            TF = Ts - Tv;
            Tw = Ts + Tv;
            T2j = fma (-KP866025403, T2b, T2a);
            T2c = fma (KP866025403, T2b, T2a);
            T1R = TK + TL;
            TM = TK - TL;
            T17 = TY + TZ;
            T10 = TY - TZ;
            T18 = fma (KP500000000, T17, -(T16));
            T1l = T16 + T17;
            TX = TV - TW;
            T14 = TW + TV;
            T1k = T13 + T14;
            T15 = fma (-KP500000000, T14, T13);
            TP = TN - TO;
            T1a = TN + TO;
            T1b = TQ + TR;
            TS = TQ - TR;
            {
              real_t T1Q, T11, TT, T1S;
              T1Q = T1k + T1l;
              T1m = T1k - T1l;
              T11 = TX + T10;
              T1W = T10 - TX;
              T1X = TP - TS;
              TT = TP + TS;
              T1S = T1a + T1b;
              T1c = T1a - T1b;
              {
                real_t T1Z, TU, T1T, T20;
                T19 = T15 + T18;
                T1Z = T15 - T18;
                T1j = TM + TT;
                TU = fma (-KP500000000, TT, TM);
                T1T = T1R + T1S;
                T20 = fma (-KP500000000, T1S, T1R);
                T12 = fma (KP866025403, T11, TU);
                T1f = fma (-KP866025403, T11, TU);
                T21 = T1Z + T20;
                T24 = T1Z - T20;
                T27 = T1Q - T1T;
                T1U = T1Q + T1T;
              }
            }
          }
        }
        {
          real_t T1g, T1d, T25, T1Y;
          T1g = fma (-KP866025403, T1c, T19);
          T1d = fma (KP866025403, T1c, T19);
          T25 = T1W - T1X;
          T1Y = T1W + T1X;
          /* DC output: sum of all inputs. */
          v0.y = T1P + T1U;
          v0.x = T1 + To;
          {
            real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
            {
              real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI,
                T1s;
              {
                real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x,
                  T1y, T1H, T1i;
                TC = fma (KP301479260, TB, Tw);
                T1x = fma (-KP226109445, Tw, TB);
                T1y = fma (KP686558370, TE, TF);
                TG = fma (-KP514918778, TF, TE);
                T1n = fma (-KP302775637, T1m, T1j);
                T1G = fma (KP302775637, T1j, T1m);
                T1u = fma (-KP038632954, T12, T1d);
                T1e = fma (KP038632954, T1d, T12);
                T1h = fma (KP612264650, T1g, T1f);
                T1v = fma (-KP612264650, T1f, T1g);
                T1J = fma (KP769338817, T1y, T1x);
                T1z = fma (-KP769338817, T1y, T1x);
                T1H = fma (-KP853480001, T1v, T1u);
                T1w = fma (KP853480001, T1v, T1u);
                T1I = fma (-KP522026385, T1H, T1G);
                T1O = fma (KP957805992, T1G, T1H);
                Tp = fma (-KP083333333, To, T1);
                T1E = fma (KP853480001, T1h, T1e);
                T1i = fma (-KP853480001, T1h, T1e);
                T1q = fma (-KP859542535, TG, TH);
                TI = fma (KP581704778, TH, TG);
                T1o = fma (KP957805992, T1n, T1i);
                T1s = fma (-KP522026385, T1i, T1n);
              }
              {
                real_t T1D, T1p, TD, T1r;
                T1p = fma (-KP251768516, TC, Tp);
                TD = fma (KP503537032, TC, Tp);
                T1C = fma (-KP968287244, T1z, T1w);
                T1A = fma (KP968287244, T1z, T1w);
                TJ = fma (KP516520780, TI, TD);
                T1N = fma (-KP516520780, TI, TD);
                T1D = fma (-KP300462606, T1q, T1p);
                T1r = fma (KP300462606, T1q, T1p);
                T1t = fma (-KP575140729, T1s, T1r);
                T1B = fma (KP575140729, T1s, T1r);
                T1L = fma (-KP520028571, T1E, T1D);
                T1F = fma (KP520028571, T1E, T1D);
                T1K = fma (KP875502302, T1J, T1I);
                T1M = fma (-KP875502302, T1J, T1I);
              }
            }
            {
              real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C,
                T28, T2y, T2M, T2q;
              {
                real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D,
                  T2E, T2u, T2m;
                T2D = fma (-KP226109445, T1Y, T21);
                T22 = fma (KP301479260, T21, T1Y);
                T26 = fma (-KP514918778, T25, T24);
                T2E = fma (KP686558370, T24, T25);
                T2v = fma (-KP302775637, T2n, T2o);
                T2p = fma (KP302775637, T2o, T2n);
                T2i = fma (-KP038632954, T2h, T2c);
                T2s = fma (KP038632954, T2c, T2h);
                T2t = fma (KP612264650, T2j, T2k);
                T2l = fma (-KP612264650, T2k, T2j);
                T2F = fma (-KP769338817, T2E, T2D);
                T2N = fma (KP769338817, T2E, T2D);
                T2K = fma (KP853480001, T2t, T2s);
                T2u = fma (-KP853480001, T2t, T2s);
                T2w = fma (KP957805992, T2v, T2u);
                T2A = fma (-KP522026385, T2u, T2v);
                T1V = fma (-KP083333333, T1U, T1P);
                T2m = fma (-KP853480001, T2l, T2i);
                T2C = fma (KP853480001, T2l, T2i);
                T28 = fma (KP581704778, T27, T26);
                T2y = fma (-KP859542535, T26, T27);
                T2M = fma (-KP522026385, T2m, T2p);
                T2q = fma (KP957805992, T2p, T2m);
              }
              {
                real_t T2O, T2Q, T2z, T2P, T2L;
                {
                  real_t T23, T2x, T2r, T29, T2J;
                  T23 = fma (KP503537032, T22, T1V);
                  T2x = fma (-KP251768516, T22, T1V);
                  T2O = fma (-KP875502302, T2N, T2M);
                  T2Q = fma (KP875502302, T2N, T2M);
                  T2r = fma (KP516520780, T28, T23);
                  T29 = fma (-KP516520780, T28, T23);
                  T2z = fma (KP300462606, T2y, T2x);
                  T2J = fma (-KP300462606, T2y, T2x);
                  /* Conjugate-symmetric output pairs share partial sums
                     with opposite fma signs. */
                  v12.x = fma (KP600477271, T1o, TJ);
                  v12.y = fma (-KP600477271, T2w, T2r);
                  v1.y = fma (KP600477271, T2w, T2r);
                  v1.x = fma (-KP600477271, T1o, TJ);
                  v8.x = fma (-KP600477271, T1O, T1N);
                  v8.y = fma (KP600477271, T2q, T29);
                  v5.y = fma (-KP600477271, T2q, T29);
                  v5.x = fma (KP600477271, T1O, T1N);
                  T2P = fma (KP520028571, T2K, T2J);
                  T2L = fma (-KP520028571, T2K, T2J);
                }
                T2B = fma (KP575140729, T2A, T2z);
                T2H = fma (-KP575140729, T2A, T2z);
                v11.x = fma (-KP575140729, T1M, T1L);
                v11.y = fma (KP575140729, T2Q, T2P);
                v6.y = fma (-KP575140729, T2Q, T2P);
                v6.x = fma (KP575140729, T1M, T1L);
                v7.x = fma (-KP575140729, T1K, T1F);
                v7.y = fma (KP575140729, T2O, T2L);
                v2.y = fma (-KP575140729, T2O, T2L);
                v2.x = fma (KP575140729, T1K, T1F);
                T2I = fma (KP968287244, T2F, T2C);
                T2G = fma (-KP968287244, T2F, T2C);
              }
            }
          }
        }
      }
      v10.x = fma (-KP520028571, T1C, T1B);
      v10.y = fma (KP520028571, T2I, T2H);
      v4.y = fma (-KP520028571, T2I, T2H);
      v4.x = fma (KP520028571, T1C, T1B);
      v9.x = fma (-KP520028571, T1A, T1t);
      v9.y = fma (KP520028571, T2G, T2B);
      v3.y = fma (-KP520028571, T2G, T2B);
      v3.x = fma (KP520028571, T1A, T1t);
    }
  }
  /* Write all thirteen results back through the caller's pointers. */
  *u0 = v0;
  *u1 = v1;
  *u2 = v2;
  *u3 = v3;
  *u4 = v4;
  *u5 = v5;
  *u6 = v6;
  *u7 = v7;
  *u8 = v8;
  *u9 = v9;
  *u10 = v10;
  *u11 = v11;
  *u12 = v12;
}
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 13; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
double2 v8 = x[8 * threads]; | |
double2 v9 = x[9 * threads]; | |
double2 v10 = x[10 * threads]; | |
double2 v11 = x[11 * threads]; | |
double2 v12 = x[12 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.483321946706122 * k / p)); | |
v2 = mul(v2, twiddle((double)-0.966643893412244 * k / p)); | |
v3 = mul(v3, twiddle((double)-1.449965840118366 * k / p)); | |
v4 = mul(v4, twiddle((double)-1.933287786824488 * k / p)); | |
v5 = mul(v5, twiddle((double)-2.41660973353061 * k / p)); | |
v6 = mul(v6, twiddle((double)-2.899931680236732 * k / p)); | |
v7 = mul(v7, twiddle((double)-3.383253626942854 * k / p)); | |
v8 = mul(v8, twiddle((double)-3.866575573648976 * k / p)); | |
v9 = mul(v9, twiddle((double)-4.349897520355098 * k / p)); | |
v10 = mul(v10, twiddle((double)-4.833219467061221 * k / p)); | |
v11 = mul(v11, twiddle((double)-5.316541413767341 * k / p)); | |
v12 = mul(v12, twiddle((double)-5.799863360473465 * k / p)); | |
} | |
dft13(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12); | |
const size_t j = k + (i - k) * 13; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
y[11 * p] = v11; | |
y[12 * p] = v12; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Conjugate complex product a * conj(b) (sign = +1 / inverse direction). */
double2 mul
(
    double2 a,
    double2 b
)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    double2 r = {re, im};
    return r;
}
/*
 * Twiddle factor e^{i*alpha} = (cos(alpha), sin(alpha)) as a double2.
 * OpenCL sincos() returns sin(alpha) and stores cos(alpha) through the
 * pointer argument, so cs is written before sn's initializer completes.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 4 FP additions, 0 FP multiplications, | |
* (or, 4 additions, 0 multiplications, 0 fused multiply/add), | |
* 6 stack variables, 0 constants, and 8 memory accesses | |
*/ | |
DEVICE void | |
dft2 (real2_t * u0, real2_t * u1) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
{ | |
{ | |
real_t T3, T1, T2, T4; | |
T3 = v0.y; | |
T1 = v0.x; | |
T2 = v1.x; | |
T4 = v1.y; | |
v0.x = T1 + T2; | |
v0.y = T3 + T4; | |
v1.y = T3 - T4; | |
v1.x = T1 - T2; | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 2; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-3.141592653589793 * k / p)); | |
} | |
dft2(&v0, &v1); | |
const size_t j = k + (i - k) * 2; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Conjugate complex product a * conj(b) (sign = +1 / inverse direction). */
double2 mul
(
    double2 a,
    double2 b
)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    double2 r = {re, im};
    return r;
}
/*
 * Twiddle factor e^{i*alpha} = (cos(alpha), sin(alpha)) as a double2.
 * OpenCL sincos() returns sin(alpha) and stores cos(alpha) through the
 * pointer argument, so cs is written before sn's initializer completes.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T5, T1, T6, T2, T3, T7; | |
T5 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Ta, T8, Tc, Tb, T9; | |
T4 = T2 + T3; | |
Ta = T2 - T3; | |
T8 = T6 + T7; | |
Tc = T7 - T6; | |
Tb = fma (-KP500000000, T4, T1); | |
T9 = fma (-KP500000000, T8, T5); | |
v0.y = T5 + T8; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, Tc, Tb); | |
v2.y = fma (-KP866025403, Ta, T9); | |
v1.y = fma (KP866025403, Ta, T9); | |
v1.x = fma (KP866025403, Tc, Tb); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Conjugate complex product a * conj(b) (sign = +1 / inverse direction). */
double2 mul
(
    double2 a,
    double2 b
)
{
    const double re = a.x * b.x + a.y * b.y;
    const double im = a.y * b.x - a.x * b.y;
    double2 r = {re, im};
    return r;
}
/*
 * Twiddle factor e^{i*alpha} = (cos(alpha), sin(alpha)) as a double2.
 * OpenCL sincos() returns sin(alpha) and stores cos(alpha) through the
 * pointer argument, so cs is written before sn's initializer completes.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 11 -name dft11 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 140 FP additions, 110 FP multiplications, | |
* (or, 30 additions, 0 multiplications, 110 fused multiply/add), | |
* 96 stack variables, 10 constants, and 44 memory accesses | |
*/ | |
/* In-place 11-point complex DFT butterfly (sign = +1 direction).
 * Machine-generated (FFTW genfft): 30 additions, 110 fused multiply/adds.
 * NOTE(review): the statement order is pipeline-scheduled by the generator;
 * do not reorder or "clean up" by hand. Temporaries use genfft's T* names. */
DEVICE void
dft11 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
       real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
       real2_t * u10)
{
  /* Load all eleven complex points into registers. */
  real2_t v0 = *u0;
  real2_t v1 = *u1;
  real2_t v2 = *u2;
  real2_t v3 = *u3;
  real2_t v4 = *u4;
  real2_t v5 = *u5;
  real2_t v6 = *u6;
  real2_t v7 = *u7;
  real2_t v8 = *u8;
  real2_t v9 = *u9;
  real2_t v10 = *u10;
  {
    /* Constants derived from the 11th roots of unity. */
    const real_t KP989821441 = +0.989821441880932732376092037776718787376519372;
    const real_t KP959492973 = +0.959492973614497389890368057066327699062454848;
    const real_t KP918985947 = +0.918985947228994779780736114132655398124909697;
    const real_t KP876768831 = +0.876768831002589333891339807079336796764054852;
    const real_t KP830830026 = +0.830830026003772851058548298459246407048009821;
    const real_t KP778434453 = +0.778434453334651800608337670740821884709317477;
    const real_t KP634356270 = +0.634356270682424498893150776899916060542806975;
    const real_t KP715370323 = +0.715370323453429719112414662767260662417897278;
    const real_t KP342584725 = +0.342584725681637509502641509861112333758894680;
    const real_t KP521108558 = +0.521108558113202722944698153526659300680427422;
    {
      real_t Th, TE, T1p, T1y, T1f, T1j, T1g, T1k;
      {
        real_t T1, TG, T4, TC, Tg, TF, T7, Ta, TD, Td, TI, T1S, T1J, TR, T10;
        real_t T21, T1m, T19, T1A, T1i, T1t, Tk, T1u, Tw, T1r, Tn, T1q, Tq, T1s, Tt;
        real_t T26, TV, TM, Ty, T1X, T1O, T1w, T13, T1E, T1c;
        Th = v0.y;
        T1 = v0.x;
        {
          real_t Ti, Tj, Tu, Tv, Tl, Tm, To, Tp, Tr, Ts;
          {
            real_t Tb, Tc, TH, T1R;
            {
              real_t T2, T3, Te, Tf;
              Ti = v1.y;
              T2 = v1.x;
              T3 = v10.x;
              Tj = v10.y;
              Tu = v5.y;
              Te = v5.x;
              Tf = v6.x;
              Tv = v6.y;
              {
                real_t T5, T6, T8, T9;
                Tl = v2.y;
                T5 = v2.x;
                /* Sums/differences of input pairs (m, 11-m). */
                TG = T2 - T3;
                T4 = T2 + T3;
                TC = Te - Tf;
                Tg = Te + Tf;
                T6 = v9.x;
                Tm = v9.y;
                To = v3.y;
                T8 = v3.x;
                T9 = v8.x;
                Tp = v8.y;
                Tr = v4.y;
                Tb = v4.x;
                TF = T5 - T6;
                T7 = T5 + T6;
                TE = T8 - T9;
                Ta = T8 + T9;
                Tc = v7.x;
                Ts = v7.y;
              }
            }
            TH = fma (-KP521108558, TG, TF);
            T1R = fma (-KP342584725, T7, Tg);
            {
              real_t T1l, T18, T1z, T1h;
              {
                real_t TQ, TZ, T20, T1I;
                T1I = fma (-KP342584725, T4, Ta);
                TD = Tb - Tc;
                Td = Tb + Tc;
                TI = fma (-KP715370323, TH, TE);
                T1S = fma (-KP634356270, T1R, T4);
                TQ = fma (KP521108558, TD, TE);
                TZ = fma (-KP521108558, TF, TD);
                T20 = fma (-KP342584725, Tg, Td);
                T1J = fma (-KP634356270, T1I, Tg);
                TR = fma (KP715370323, TQ, TG);
                T10 = fma (KP715370323, TZ, TC);
                T21 = fma (-KP634356270, T20, Ta);
                T1l = fma (-KP342584725, Ta, T7);
              }
              T18 = fma (KP521108558, TE, TC);
              T1z = fma (-KP342584725, Td, T4);
              T1h = fma (KP521108558, TC, TG);
              T1m = fma (-KP634356270, T1l, Td);
              T19 = fma (-KP715370323, T18, TF);
              T1A = fma (-KP634356270, T1z, T7);
              T1i = fma (KP715370323, T1h, TD);
            }
          }
          /* Imaginary-part sums/differences. */
          T1t = Tj - Ti;
          Tk = Ti + Tj;
          T1u = Tv - Tu;
          Tw = Tu + Tv;
          T1r = Tm - Tl;
          Tn = Tl + Tm;
          T1q = Tp - To;
          Tq = To + Tp;
          {
            real_t Tx, T1W, T25, TL, TU, T1N, T1v;
            T25 = fma (-KP521108558, T1t, T1r);
            TL = fma (-KP342584725, Tn, Tw);
            TU = fma (-KP342584725, Tk, Tq);
            T1s = Ts - Tr;
            Tt = Tr + Ts;
            T26 = fma (-KP715370323, T25, T1q);
            TV = fma (-KP634356270, TU, Tw);
            TM = fma (-KP634356270, TL, Tk);
            Tx = fma (-KP342584725, Tw, Tt);
            T1W = fma (KP521108558, T1s, T1q);
            T1N = fma (-KP521108558, T1r, T1s);
            T1v = fma (KP521108558, T1u, T1t);
            Ty = fma (-KP634356270, Tx, Tq);
            T1X = fma (KP715370323, T1W, T1t);
            T1O = fma (KP715370323, T1N, T1u);
            T1w = fma (KP715370323, T1v, T1s);
            T13 = fma (-KP342584725, Tt, Tk);
            T1E = fma (KP521108558, T1q, T1u);
            T1c = fma (-KP342584725, Tq, Tn);
          }
        }
        {
          real_t T14, T1F, T1d, T1Z, T1V, TN, TS, Tz, TJ;
          T14 = fma (-KP634356270, T13, Tn);
          T1F = fma (-KP715370323, T1E, T1r);
          T1d = fma (-KP634356270, T1c, Tt);
          /* DC output: plain sum of all inputs. */
          v0.y = Th + Tk + Tn + Tq + Tt + Tw;
          v0.x = T1 + T4 + T7 + Ta + Td + Tg;
          Tz = fma (-KP778434453, Ty, Tn);
          TJ = fma (-KP830830026, TI, TD);
          {
            real_t TK, T23, T28, TB, TA, T22, T27;
            T22 = fma (-KP778434453, T21, T7);
            T27 = fma (-KP830830026, T26, T1s);
            TA = fma (-KP876768831, Tz, Tk);
            TK = fma (-KP918985947, TJ, TC);
            T23 = fma (-KP876768831, T22, T4);
            T28 = fma (-KP918985947, T27, T1u);
            TB = fma (-KP959492973, TA, Th);
            {
              real_t T1U, T1T, T24, T1Y;
              T1T = fma (-KP778434453, T1S, Ta);
              T24 = fma (-KP959492973, T23, T1);
              T1Y = fma (-KP830830026, T1X, T1u);
              T1U = fma (-KP876768831, T1T, Td);
              /* Conjugate-symmetric output pair (5, 6). */
              v5.x = fma (KP989821441, T28, T24);
              v5.y = fma (KP989821441, TK, TB);
              v6.y = fma (-KP989821441, TK, TB);
              v6.x = fma (-KP989821441, T28, T24);
              T1Z = fma (KP918985947, T1Y, T1r);
              T1V = fma (-KP959492973, T1U, T1);
            }
            TN = fma (-KP778434453, TM, Tq);
            TS = fma (-KP830830026, TR, TC);
          }
          {
            real_t TY, T12, T1M, T15, T1a, T1Q;
            {
              real_t TT, TX, T11, TP, TO, TW, T1P, T1L, T1K;
              TW = fma (-KP778434453, TV, Tt);
              TO = fma (-KP876768831, TN, Tt);
              TT = fma (KP918985947, TS, TF);
              TX = fma (-KP876768831, TW, Tn);
              T11 = fma (-KP830830026, T10, TE);
              TP = fma (-KP959492973, TO, Th);
              T1K = fma (-KP778434453, T1J, Td);
              TY = fma (-KP959492973, TX, Th);
              T12 = fma (-KP918985947, T11, TG);
              /* Output pair (7, 4). */
              v7.x = fma (KP989821441, T1Z, T1V);
              v7.y = fma (KP989821441, TT, TP);
              v4.y = fma (-KP989821441, TT, TP);
              v4.x = fma (-KP989821441, T1Z, T1V);
              T1L = fma (-KP876768831, T1K, T7);
              T1P = fma (-KP830830026, T1O, T1q);
              T1M = fma (-KP959492973, T1L, T1);
              T15 = fma (-KP778434453, T14, Tw);
              T1a = fma (KP830830026, T19, TG);
              T1Q = fma (-KP918985947, T1P, T1t);
            }
            {
              real_t T1b, T17, T1C, T1G, T1B, T16;
              T1B = fma (-KP778434453, T1A, Tg);
              T16 = fma (-KP876768831, T15, Tq);
              T1b = fma (-KP918985947, T1a, TD);
              /* Output pair (3, 8). */
              v3.x = fma (KP989821441, T1Q, T1M);
              v3.y = fma (KP989821441, T12, TY);
              v8.y = fma (-KP989821441, T12, TY);
              v8.x = fma (-KP989821441, T1Q, T1M);
              T17 = fma (-KP959492973, T16, Th);
              T1C = fma (-KP876768831, T1B, Ta);
              T1G = fma (KP830830026, T1F, T1t);
              {
                real_t T1D, T1H, T1o, T1x, T1n, T1e;
                T1n = fma (-KP778434453, T1m, T4);
                T1D = fma (-KP959492973, T1C, T1);
                T1H = fma (-KP918985947, T1G, T1s);
                T1o = fma (-KP876768831, T1n, Tg);
                T1x = fma (KP830830026, T1w, T1r);
                T1e = fma (-KP778434453, T1d, Tk);
                /* Output pair (9, 2). */
                v9.y = fma (KP989821441, T1b, T17);
                v9.x = fma (KP989821441, T1H, T1D);
                v2.x = fma (-KP989821441, T1H, T1D);
                v2.y = fma (-KP989821441, T1b, T17);
                T1p = fma (-KP959492973, T1o, T1);
                T1y = fma (KP918985947, T1x, T1q);
                T1f = fma (-KP876768831, T1e, Tw);
                T1j = fma (KP830830026, T1i, TF);
              }
            }
          }
        }
      }
      /* Final output pair (10, 1). */
      T1g = fma (-KP959492973, T1f, Th);
      T1k = fma (KP918985947, T1j, TE);
      v10.x = fma (-KP989821441, T1y, T1p);
      v10.y = fma (-KP989821441, T1k, T1g);
      v1.y = fma (KP989821441, T1k, T1g);
      v1.x = fma (KP989821441, T1y, T1p);
    }
  }
  /* Store results back through the pointers. */
  *u0 = v0;
  *u1 = v1;
  *u2 = v2;
  *u3 = v3;
  *u4 = v4;
  *u5 = v5;
  *u6 = v6;
  *u7 = v7;
  *u8 = v8;
  *u9 = v9;
  *u10 = v10;
}
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 11; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
double2 v8 = x[8 * threads]; | |
double2 v9 = x[9 * threads]; | |
double2 v10 = x[10 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.5711986642890533 * k / p)); | |
v2 = mul(v2, twiddle((double)-1.142397328578107 * k / p)); | |
v3 = mul(v3, twiddle((double)-1.71359599286716 * k / p)); | |
v4 = mul(v4, twiddle((double)-2.284794657156213 * k / p)); | |
v5 = mul(v5, twiddle((double)-2.855993321445267 * k / p)); | |
v6 = mul(v6, twiddle((double)-3.42719198573432 * k / p)); | |
v7 = mul(v7, twiddle((double)-3.998390650023373 * k / p)); | |
v8 = mul(v8, twiddle((double)-4.569589314312426 * k / p)); | |
v9 = mul(v9, twiddle((double)-5.140787978601479 * k / p)); | |
v10 = mul(v10, twiddle((double)-5.711986642890533 * k / p)); | |
} | |
dft11(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10); | |
const size_t j = k + (i - k) * 11; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product with the conjugate of b: a * conj(b).
 * Used to apply inverse-direction (sign = +1) twiddle factors. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x + a.y * b.y;
    prod.y = a.y * b.x - a.x * b.y;
    return prod;
}
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 13 -name dft13 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 176 FP additions, 114 FP multiplications, | |
* (or, 62 additions, 0 multiplications, 114 fused multiply/add), | |
* 114 stack variables, 25 constants, and 52 memory accesses | |
*/ | |
/* In-place 13-point complex DFT butterfly (sign = +1 direction).
 * Machine-generated (FFTW genfft): 62 additions, 114 fused multiply/adds.
 * NOTE(review): the statement order is pipeline-scheduled by the generator;
 * do not reorder or "clean up" by hand. Temporaries use genfft's T* names. */
DEVICE void
dft13 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4,
       real2_t * u5, real2_t * u6, real2_t * u7, real2_t * u8, real2_t * u9,
       real2_t * u10, real2_t * u11, real2_t * u12)
{
  /* Load all thirteen complex points into registers. */
  real2_t v0 = *u0;
  real2_t v1 = *u1;
  real2_t v2 = *u2;
  real2_t v3 = *u3;
  real2_t v4 = *u4;
  real2_t v5 = *u5;
  real2_t v6 = *u6;
  real2_t v7 = *u7;
  real2_t v8 = *u8;
  real2_t v9 = *u9;
  real2_t v10 = *u10;
  real2_t v11 = *u11;
  real2_t v12 = *u12;
  {
    /* Constants derived from the 13th roots of unity. */
    const real_t KP600477271 = +0.600477271932665282925769253334763009352012849;
    const real_t KP875502302 = +0.875502302409147941146295545768755143177842006;
    const real_t KP520028571 = +0.520028571888864619117130500499232802493238139;
    const real_t KP575140729 = +0.575140729474003121368385547455453388461001608;
    const real_t KP300462606 = +0.300462606288665774426601772289207995520941381;
    const real_t KP516520780 = +0.516520780623489722840901288569017135705033622;
    const real_t KP968287244 = +0.968287244361984016049539446938120421179794516;
    const real_t KP503537032 = +0.503537032863766627246873853868466977093348562;
    const real_t KP251768516 = +0.251768516431883313623436926934233488546674281;
    const real_t KP581704778 = +0.581704778510515730456870384989698884939833902;
    const real_t KP859542535 = +0.859542535098774820163672132761689612766401925;
    const real_t KP083333333 = +0.083333333333333333333333333333333333333333333;
    const real_t KP957805992 = +0.957805992594665126462521754605754580515587217;
    const real_t KP522026385 = +0.522026385161275033714027226654165028300441940;
    const real_t KP853480001 = +0.853480001859823990758994934970528322872359049;
    const real_t KP769338817 = +0.769338817572980603471413688209101117038278899;
    const real_t KP612264650 = +0.612264650376756543746494474777125408779395514;
    const real_t KP038632954 = +0.038632954644348171955506895830342264440241080;
    const real_t KP302775637 = +0.302775637731994646559610633735247973125648287;
    const real_t KP514918778 = +0.514918778086315755491789696138117261566051239;
    const real_t KP686558370 = +0.686558370781754340655719594850823015421401653;
    const real_t KP226109445 = +0.226109445035782405468510155372505010481906348;
    const real_t KP301479260 = +0.301479260047709873958013540496673347309208464;
    const real_t KP866025403 = +0.866025403784438646763723170752936183471402627;
    const real_t KP500000000 = +0.500000000000000000000000000000000000000000000;
    {
      real_t T1C, T1A, T1t, T1B, T2B, T2H, T2I, T2G;
      {
        real_t T1P, T1, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw, T2j, T2c, T1m;
        real_t T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
        T1P = v0.y;
        T1 = v0.x;
        {
          real_t TK, TL, T16, TY, TZ, T13, TW, TV, TN, TO, TQ, TR, T2b, Tv, Ts;
          real_t T2a;
          {
            real_t T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu, Tl;
            {
              real_t T7, T8, T9, Td, Te;
              TK = v8.y;
              Td = v8.x;
              Te = v5.x;
              TL = v5.y;
              T16 = v12.y;
              T7 = v12.x;
              T8 = v10.x;
              TY = v10.y;
              TZ = v4.y;
              T9 = v4.x;
              T2d = Td - Te;
              Tf = Td + Te;
              {
                real_t T2, Ta, T3, T4;
                T2 = v1.x;
                T13 = v1.y;
                Ta = T8 + T9;
                Tq = T8 - T9;
                TW = v3.y;
                T3 = v3.x;
                T4 = v9.x;
                TV = v9.y;
                {
                  real_t Tg, T5, Th, Tj, Tk;
                  TN = v11.y;
                  Tg = v11.x;
                  Ty = fma (KP500000000, Ta, -(T7));
                  Tb = T7 + Ta;
                  Tr = T4 - T3;
                  T5 = T3 + T4;
                  Th = v6.x;
                  TO = v6.y;
                  TQ = v7.y;
                  Tj = v7.x;
                  Tk = v2.x;
                  TR = v2.y;
                  T6 = T2 + T5;
                  Tx = fma (-KP500000000, T5, T2);
                  Ti = Tg + Th;
                  Tt = Tg - Th;
                  Tu = Tj - Tk;
                  Tl = Tj + Tk;
                }
              }
            }
            {
              real_t Tc, Tm, T2e, T2g;
              Tc = T6 + Tb;
              T2n = T6 - Tb;
              T2b = Ti - Tl;
              Tm = Ti + Tl;
              T2e = Tt + Tu;
              Tv = Tt - Tu;
              Ts = Tq - Tr;
              T2g = Tr + Tq;
              {
                real_t Tz, TA, Tn, T2f;
                Tz = Tx - Ty;
                T2a = Tx + Ty;
                TA = fma (-KP500000000, Tm, Tf);
                Tn = Tf + Tm;
                T2f = fma (-KP500000000, T2e, T2d);
                T2o = T2d + T2e;
                To = Tc + Tn;
                TH = Tc - Tn;
                T2h = fma (KP866025403, T2g, T2f);
                T2k = fma (-KP866025403, T2g, T2f);
                TE = Tz - TA;
                TB = Tz + TA;
              }
            }
          }
          {
            real_t T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a, T1b, TS, T17, T14;
            TF = Ts - Tv;
            Tw = Ts + Tv;
            T2j = fma (-KP866025403, T2b, T2a);
            T2c = fma (KP866025403, T2b, T2a);
            T1R = TK + TL;
            TM = TK - TL;
            T17 = TY + TZ;
            T10 = TY - TZ;
            T18 = fma (KP500000000, T17, -(T16));
            T1l = T16 + T17;
            TX = TV - TW;
            T14 = TW + TV;
            T1k = T13 + T14;
            T15 = fma (-KP500000000, T14, T13);
            TP = TN - TO;
            T1a = TN + TO;
            T1b = TQ + TR;
            TS = TQ - TR;
            {
              real_t T1Q, T11, TT, T1S;
              T1Q = T1k + T1l;
              T1m = T1k - T1l;
              T11 = TX + T10;
              T1W = T10 - TX;
              T1X = TP - TS;
              TT = TP + TS;
              T1S = T1a + T1b;
              T1c = T1a - T1b;
              {
                real_t T1Z, TU, T1T, T20;
                T19 = T15 + T18;
                T1Z = T15 - T18;
                T1j = TM + TT;
                TU = fma (-KP500000000, TT, TM);
                T1T = T1R + T1S;
                T20 = fma (-KP500000000, T1S, T1R);
                T12 = fma (KP866025403, T11, TU);
                T1f = fma (-KP866025403, T11, TU);
                T21 = T1Z + T20;
                T24 = T1Z - T20;
                T27 = T1Q - T1T;
                T1U = T1Q + T1T;
              }
            }
          }
        }
        {
          real_t T1g, T1d, T25, T1Y;
          T1g = fma (-KP866025403, T1c, T19);
          T1d = fma (KP866025403, T1c, T19);
          T25 = T1W - T1X;
          T1Y = T1W + T1X;
          /* DC output: plain sum of all inputs. */
          v0.y = T1P + T1U;
          v0.x = T1 + To;
          {
            real_t T1O, T1o, TJ, T1N, T1L, T1F, T1K, T1M;
            {
              real_t TC, T1J, T1z, T1w, T1I, Tp, T1E, T1q, TI, T1s;
              {
                real_t TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x, T1y, T1H, T1i;
                TC = fma (KP301479260, TB, Tw);
                T1x = fma (-KP226109445, Tw, TB);
                T1y = fma (KP686558370, TE, TF);
                TG = fma (-KP514918778, TF, TE);
                T1n = fma (-KP302775637, T1m, T1j);
                T1G = fma (KP302775637, T1j, T1m);
                T1u = fma (-KP038632954, T12, T1d);
                T1e = fma (KP038632954, T1d, T12);
                T1h = fma (KP612264650, T1g, T1f);
                T1v = fma (-KP612264650, T1f, T1g);
                T1J = fma (KP769338817, T1y, T1x);
                T1z = fma (-KP769338817, T1y, T1x);
                T1H = fma (-KP853480001, T1v, T1u);
                T1w = fma (KP853480001, T1v, T1u);
                T1I = fma (-KP522026385, T1H, T1G);
                T1O = fma (KP957805992, T1G, T1H);
                Tp = fma (-KP083333333, To, T1);
                T1E = fma (KP853480001, T1h, T1e);
                T1i = fma (-KP853480001, T1h, T1e);
                T1q = fma (-KP859542535, TG, TH);
                TI = fma (KP581704778, TH, TG);
                T1o = fma (KP957805992, T1n, T1i);
                T1s = fma (-KP522026385, T1i, T1n);
              }
              {
                real_t T1D, T1p, TD, T1r;
                T1p = fma (-KP251768516, TC, Tp);
                TD = fma (KP503537032, TC, Tp);
                T1C = fma (KP968287244, T1z, T1w);
                T1A = fma (-KP968287244, T1z, T1w);
                TJ = fma (KP516520780, TI, TD);
                T1N = fma (-KP516520780, TI, TD);
                T1D = fma (-KP300462606, T1q, T1p);
                T1r = fma (KP300462606, T1q, T1p);
                T1t = fma (KP575140729, T1s, T1r);
                T1B = fma (-KP575140729, T1s, T1r);
                T1L = fma (KP520028571, T1E, T1D);
                T1F = fma (-KP520028571, T1E, T1D);
                T1K = fma (-KP875502302, T1J, T1I);
                T1M = fma (KP875502302, T1J, T1I);
              }
            }
            {
              real_t T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C, T28, T2y, T2M, T2q;
              {
                real_t T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D, T2E, T2u, T2m;
                T2D = fma (-KP226109445, T1Y, T21);
                T22 = fma (KP301479260, T21, T1Y);
                T26 = fma (-KP514918778, T25, T24);
                T2E = fma (KP686558370, T24, T25);
                T2v = fma (-KP302775637, T2n, T2o);
                T2p = fma (KP302775637, T2o, T2n);
                T2i = fma (-KP038632954, T2h, T2c);
                T2s = fma (KP038632954, T2c, T2h);
                T2t = fma (KP612264650, T2j, T2k);
                T2l = fma (-KP612264650, T2k, T2j);
                T2F = fma (-KP769338817, T2E, T2D);
                T2N = fma (KP769338817, T2E, T2D);
                T2K = fma (KP853480001, T2t, T2s);
                T2u = fma (-KP853480001, T2t, T2s);
                T2w = fma (KP957805992, T2v, T2u);
                T2A = fma (-KP522026385, T2u, T2v);
                T1V = fma (-KP083333333, T1U, T1P);
                T2m = fma (-KP853480001, T2l, T2i);
                T2C = fma (KP853480001, T2l, T2i);
                T28 = fma (KP581704778, T27, T26);
                T2y = fma (-KP859542535, T26, T27);
                T2M = fma (-KP522026385, T2m, T2p);
                T2q = fma (KP957805992, T2p, T2m);
              }
              {
                real_t T2O, T2Q, T2z, T2P, T2L;
                {
                  real_t T23, T2x, T2r, T29, T2J;
                  T23 = fma (KP503537032, T22, T1V);
                  T2x = fma (-KP251768516, T22, T1V);
                  T2O = fma (-KP875502302, T2N, T2M);
                  T2Q = fma (KP875502302, T2N, T2M);
                  T2r = fma (KP516520780, T28, T23);
                  T29 = fma (-KP516520780, T28, T23);
                  T2z = fma (KP300462606, T2y, T2x);
                  T2J = fma (-KP300462606, T2y, T2x);
                  /* Conjugate-symmetric output pairs (12,1) and (8,5). */
                  v12.x = fma (-KP600477271, T1o, TJ);
                  v12.y = fma (KP600477271, T2w, T2r);
                  v1.y = fma (-KP600477271, T2w, T2r);
                  v1.x = fma (KP600477271, T1o, TJ);
                  v8.x = fma (KP600477271, T1O, T1N);
                  v8.y = fma (-KP600477271, T2q, T29);
                  v5.y = fma (KP600477271, T2q, T29);
                  v5.x = fma (-KP600477271, T1O, T1N);
                  T2P = fma (KP520028571, T2K, T2J);
                  T2L = fma (-KP520028571, T2K, T2J);
                }
                T2B = fma (KP575140729, T2A, T2z);
                T2H = fma (-KP575140729, T2A, T2z);
                /* Output pairs (2,7) and (6,11). */
                v2.x = fma (-KP575140729, T1K, T1F);
                v2.y = fma (KP575140729, T2Q, T2P);
                v7.y = fma (-KP575140729, T2Q, T2P);
                v7.x = fma (KP575140729, T1K, T1F);
                v6.x = fma (-KP575140729, T1M, T1L);
                v6.y = fma (KP575140729, T2O, T2L);
                v11.y = fma (-KP575140729, T2O, T2L);
                v11.x = fma (KP575140729, T1M, T1L);
                T2I = fma (KP968287244, T2F, T2C);
                T2G = fma (-KP968287244, T2F, T2C);
              }
            }
          }
        }
      }
      /* Output pairs (3,9) and (4,10). */
      v3.x = fma (-KP520028571, T1A, T1t);
      v3.y = fma (KP520028571, T2I, T2H);
      v9.y = fma (-KP520028571, T2I, T2H);
      v9.x = fma (KP520028571, T1A, T1t);
      v4.x = fma (-KP520028571, T1C, T1B);
      v4.y = fma (KP520028571, T2G, T2B);
      v10.y = fma (-KP520028571, T2G, T2B);
      v10.x = fma (KP520028571, T1C, T1B);
    }
  }
  /* Store results back through the pointers. */
  *u0 = v0;
  *u1 = v1;
  *u2 = v2;
  *u3 = v3;
  *u4 = v4;
  *u5 = v5;
  *u6 = v6;
  *u7 = v7;
  *u8 = v8;
  *u9 = v9;
  *u10 = v10;
  *u11 = v11;
  *u12 = v12;
}
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 13; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
double2 v4 = x[4 * threads]; | |
double2 v5 = x[5 * threads]; | |
double2 v6 = x[6 * threads]; | |
double2 v7 = x[7 * threads]; | |
double2 v8 = x[8 * threads]; | |
double2 v9 = x[9 * threads]; | |
double2 v10 = x[10 * threads]; | |
double2 v11 = x[11 * threads]; | |
double2 v12 = x[12 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-0.483321946706122 * k / p)); | |
v2 = mul(v2, twiddle((double)-0.966643893412244 * k / p)); | |
v3 = mul(v3, twiddle((double)-1.449965840118366 * k / p)); | |
v4 = mul(v4, twiddle((double)-1.933287786824488 * k / p)); | |
v5 = mul(v5, twiddle((double)-2.41660973353061 * k / p)); | |
v6 = mul(v6, twiddle((double)-2.899931680236732 * k / p)); | |
v7 = mul(v7, twiddle((double)-3.383253626942854 * k / p)); | |
v8 = mul(v8, twiddle((double)-3.866575573648976 * k / p)); | |
v9 = mul(v9, twiddle((double)-4.349897520355098 * k / p)); | |
v10 = mul(v10, twiddle((double)-4.833219467061221 * k / p)); | |
v11 = mul(v11, twiddle((double)-5.316541413767341 * k / p)); | |
v12 = mul(v12, twiddle((double)-5.799863360473465 * k / p)); | |
} | |
dft13(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12); | |
const size_t j = k + (i - k) * 13; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
y[4 * p] = v4; | |
y[5 * p] = v5; | |
y[6 * p] = v6; | |
y[7 * p] = v7; | |
y[8 * p] = v8; | |
y[9 * p] = v9; | |
y[10 * p] = v10; | |
y[11 * p] = v11; | |
y[12 * p] = v12; | |
} | |
/tmp/vexcl/tests/fft.cpp:94: error in "test_dimensions": absolute value of rms(back, inp){0.52512901785634258} exceeds 1e-08 | |
FFT(C2C) size=54x58 batch=1 | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Plain complex product: (a.x + i*a.y) * (b.x + i*b.y).
 * Forward-direction (sign = -1) twiddle application. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 4 FP additions, 0 FP multiplications, | |
* (or, 4 additions, 0 multiplications, 0 fused multiply/add), | |
* 6 stack variables, 0 constants, and 8 memory accesses | |
*/ | |
DEVICE void | |
dft2 (real2_t * u0, real2_t * u1) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
{ | |
{ | |
real_t T3, T1, T2, T4; | |
T3 = v0.y; | |
T1 = v0.x; | |
T2 = v1.x; | |
T4 = v1.y; | |
v0.x = T1 + T2; | |
v0.y = T3 + T4; | |
v1.y = T3 - T4; | |
v1.x = T1 - T2; | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 2; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-3.141592653589793 * k / p)); | |
} | |
dft2(&v0, &v1); | |
const size_t j = k + (i - k) * 2; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}
kernel void bluestein_twiddle | |
( | |
ulong n, | |
global double2 * output | |
) | |
{ | |
const size_t x = get_global_id(0); | |
const size_t xx = ((ulong)x * x) % (2 * n); | |
if (x < n) output[x] = twiddle(-3.141592653589793 * xx / n); | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex conjugate: negate the imaginary part. */
double2 conj
(
    double2 v
)
{
    double2 r;
    r.x = v.x;
    r.y = -v.y;
    return r;
}
kernel void bluestein_pad_kernel | |
( | |
global const double2 * input, | |
global double2 * output, | |
uint n, | |
uint m | |
) | |
{ | |
const uint x = get_global_id(0); | |
if (x < m) | |
{ | |
if(x < n || m - x < n) | |
{ | |
output[x] = conj(input[min(x, m - x)]); | |
} | |
else | |
{ | |
double2 r = {0,0}; | |
output[x] = r; | |
} | |
} | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Plain complex product: (a.x + i*a.y) * (b.x + i*b.y).
 * Forward-direction (sign = -1) twiddle application. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
DEVICE void | |
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
real2_t v3 = *u3; | |
{ | |
{ | |
real_t Tc, T4, Tb, T3, Tf, T9, T5, Td; | |
{ | |
real_t T7, T1, T2, T8; | |
T7 = v0.y; | |
T1 = v0.x; | |
T2 = v2.x; | |
T8 = v2.y; | |
Tc = v1.y; | |
T4 = v1.x; | |
Tb = T1 - T2; | |
T3 = T1 + T2; | |
Tf = T7 + T8; | |
T9 = T7 - T8; | |
T5 = v3.x; | |
Td = v3.y; | |
} | |
{ | |
real_t T6, Ta, Te, Tg; | |
T6 = T4 + T5; | |
Ta = T4 - T5; | |
Te = Tc - Td; | |
Tg = Tc + Td; | |
v0.x = T3 + T6; | |
v0.y = Tf + Tg; | |
v2.y = Tf - Tg; | |
v2.x = T3 - T6; | |
v3.y = Ta + T9; | |
v3.x = Tb - Te; | |
v1.x = Tb + Te; | |
v1.y = T9 - Ta; | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
*u3 = v3; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 4; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
double2 v3 = x[3 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p)); | |
v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p)); | |
v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p)); | |
} | |
dft4(&v0, &v1, &v2, &v3); | |
const size_t j = k + (i - k) * 4; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
y[3 * p] = v3; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Plain complex product: (a.x + i*a.y) * (b.x + i*b.y).
 * Forward-direction (sign = -1) twiddle application. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
DEVICE void | |
dft3 (real2_t * u0, real2_t * u1, real2_t * u2) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
real2_t v2 = *u2; | |
{ | |
const real_t KP866025403 = | |
+0.866025403784438646763723170752936183471402627; | |
const real_t KP500000000 = | |
+0.500000000000000000000000000000000000000000000; | |
{ | |
real_t T9, T1, T6, T2, T3, T7; | |
T9 = v0.y; | |
T1 = v0.x; | |
T6 = v1.y; | |
T2 = v1.x; | |
T3 = v2.x; | |
T7 = v2.y; | |
{ | |
real_t T4, Tc, T8, Ta, T5, Tb; | |
T4 = T2 + T3; | |
Tc = T3 - T2; | |
T8 = T6 - T7; | |
Ta = T6 + T7; | |
T5 = fma (-KP500000000, T4, T1); | |
Tb = fma (-KP500000000, Ta, T9); | |
v0.y = T9 + Ta; | |
v0.x = T1 + T4; | |
v2.x = fma (-KP866025403, T8, T5); | |
v2.y = fma (-KP866025403, Tc, Tb); | |
v1.y = fma (KP866025403, Tc, Tb); | |
v1.x = fma (KP866025403, T8, T5); | |
} | |
} | |
} | |
*u0 = v0; | |
*u1 = v1; | |
*u2 = v2; | |
} | |
kernel void radix | |
( | |
global const double2 * x, | |
global double2 * y, | |
uint p, | |
uint threads | |
) | |
{ | |
const size_t i = get_global_id(0); | |
if(i >= threads) return; | |
const size_t k = i % p; | |
const size_t batch_offset = get_global_id(1) * threads * 3; | |
x += i + batch_offset; | |
double2 v0 = x[0 * threads]; | |
double2 v1 = x[1 * threads]; | |
double2 v2 = x[2 * threads]; | |
if(p != 1) | |
{ | |
v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p)); | |
v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p)); | |
} | |
dft3(&v0, &v1, &v2); | |
const size_t j = k + (i - k) * 3; | |
y += j + batch_offset; | |
y[0 * p] = v0; | |
y[1 * p] = v1; | |
y[2 * p] = v2; | |
} | |
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Plain complex product: (a.x + i*a.y) * (b.x + i*b.y).
 * Forward-direction (sign = -1) twiddle application. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 prod;
    prod.x = a.x * b.x - a.y * b.y;
    prod.y = a.y * b.x + a.x * b.y;
    return prod;
}
/* Unit-magnitude twiddle factor: returns (cos(alpha), sin(alpha)).
 * OpenCL sincos returns the sine and stores the cosine via the pointer. */
double2 twiddle
(
    double alpha
)
{
    double c;
    double s = sincos(alpha, &c);
    double2 w = {c, s};
    return w;
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
/* In-place 5-point complex DFT butterfly (forward, sign = -1).
 * Machine-generated (FFTW genfft): 14 additions, 18 fused multiply/adds.
 * NOTE(review): statement order is pipeline-scheduled by the generator;
 * do not reorder by hand. Temporaries use genfft's T* names. */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
  /* Load all five complex points into registers. */
  real2_t v0 = *u0;
  real2_t v1 = *u1;
  real2_t v2 = *u2;
  real2_t v3 = *u3;
  real2_t v4 = *u4;
  {
    /* Constants derived from the 5th roots of unity. */
    const real_t KP951056516 = +0.951056516295153572116439333379382143405698634;
    const real_t KP559016994 = +0.559016994374947424102293417182819058860154590;
    const real_t KP250000000 = +0.250000000000000000000000000000000000000000000;
    const real_t KP618033988 = +0.618033988749894848204586834365638117720309180;
    {
      real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
      {
        real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
        Tl = v0.y;
        T1 = v0.x;
        {
          real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
          Tc = v1.y;
          T2 = v1.x;
          T3 = v4.x;
          Td = v4.y;
          Tf = v2.y;
          T5 = v2.x;
          T6 = v3.x;
          Tg = v3.y;
          /* Sums/differences of input pairs (1,4) and (2,3). */
          Ts = T2 - T3;
          T4 = T2 + T3;
          Tt = T5 - T6;
          T7 = T5 + T6;
          T8 = T4 + T7;
          Ta = T4 - T7;
          Te = Tc - Td;
          Tm = Tc + Td;
          Tn = Tf + Tg;
          Th = Tf - Tg;
        }
        To = Tm + Tn;
        Tq = Tm - Tn;
        Ti = fma (KP618033988, Th, Te);
        Tk = fma (-KP618033988, Te, Th);
        /* DC output: plain sum of all inputs. */
        v0.y = Tl + To;
        v0.x = T1 + T8;
        T9 = fma (-KP250000000, T8, T1);
        Tu = fma (KP618033988, Tt, Ts);
        Tw = fma (-KP618033988, Ts, Tt);
        Tp = fma (-KP250000000, To, Tl);
        Tb = fma (KP559016994, Ta, T9);
        Tj = fma (-KP559016994, Ta, T9);
      }
      Tr = fma (KP559016994, Tq, Tp);
      Tv = fma (-KP559016994, Tq, Tp);
      /* Conjugate-symmetric output pairs (2,3) and (4,1). */
      v2.x = fma (-KP951056516, Tk, Tj);
      v2.y = fma (KP951056516, Tw, Tv);
      v3.y = fma (-KP951056516, Tw, Tv);
      v3.x = fma (KP951056516, Tk, Tj);
      v4.x = fma (-KP951056516, Ti, Tb);
      v4.y = fma (KP951056516, Tu, Tr);
      v1.y = fma (-KP951056516, Tu, Tr);
      v1.x = fma (KP951056516, Ti, Tb);
    }
  }
  /* Store results back through the pointers. */
  *u0 = v0;
  *u1 = v1;
  *u2 = v2;
  *u3 = v3;
  *u4 = v4;
}
/*
 * Radix-5 decimation-in-time FFT stage (forward transform).
 * Each work-item loads 5 complex points strided by `threads`, applies the
 * twiddle factors (skipped on the first stage, p == 1), runs a 5-point DFT
 * and scatters the results at the digit-reversed output offset j.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 5) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    /* the second global dimension selects the batch */
    const size_t batch_offset = get_global_id(1) * threads * 5;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    double2 v3 = x[3 * threads];
    double2 v4 = x[4 * threads];
    if(p != 1)
    {
        /* angles are m * (-2*pi/5) for m = 1..4 */
        v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p));
        v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p));
        v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p));
        v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p));
    }
    dft5(&v0, &v1, &v2, &v3, &v4);
    const size_t j = k + (i - k) * 5;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
    y[3 * p] = v3;
    y[4 * p] = v4;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    const double re = a.x * b.x - a.y * b.y;
    const double im = a.y * b.x + a.x * b.y;
    double2 product = { re, im };
    return product;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)).
 * OpenCL sincos() returns the sine and writes the cosine through the
 * pointer argument, so sn = sin(alpha) and cs = cos(alpha).
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}
/*
 * Bluestein pre-multiplication: copies each `radix`-sized input chunk into a
 * zero-padded convolution buffer of stride `out_stride`, multiplying every
 * element by the Bluestein chirp `exp` (and by an extra twiddle factor for
 * stages with p != 1).
 */
kernel void bluestein_mul_in
(
    global const double2 * data,
    global const double2 * exp,
    global double2 * output,
    uint radix,
    uint p,
    uint out_stride
)
{
    const size_t thread = get_global_id(0);
    const size_t threads = get_global_size(0);
    const size_t batch = get_global_id(1);
    const size_t element = get_global_id(2);
    if(element < out_stride)
    {
        const size_t in_off = thread + batch * radix * threads + element * threads;
        const size_t out_off = thread * out_stride + batch * out_stride * threads + element;
        if(element < radix)
        {
            double2 w = exp[element];
            if(p != 1)
            {
                /* extra twiddle exp(-2*pi*i * element * (thread % p) / (radix * p));
                 * the numerator is reduced modulo 2*b to keep the sincos
                 * argument small and accurate */
                ulong a = (ulong)element * (thread % p);
                ulong b = (ulong)radix * p;
                double2 t = twiddle(-6.283185307179586 * (a % (2 * b)) / b);
                w = mul(w, t);
            }
            output[out_off] = mul(data[in_off], w);
        }
        else
        {
            /* zero padding between radix and the convolution length */
            double2 r = {0,0};
            output[out_off] = r;
        }
    }
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
/*
 * 4-point forward DFT butterfly applied in place to *u0..*u3.
 * FFTW-generated straight-line code; temporary names (T1, Ta, ...) come
 * from the code generator and the statement order is scheduled for the
 * pipeline — do not reorder.
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    real2_t v3 = *u3;
    {
        {
            real_t Tc, T4, Tb, T3, Tf, T9, T5, Td;
            {
                real_t T7, T1, T2, T8;
                T7 = v0.y;
                T1 = v0.x;
                T2 = v2.x;
                T8 = v2.y;
                Tc = v1.y;
                T4 = v1.x;
                Tb = T1 - T2;
                T3 = T1 + T2;
                Tf = T7 + T8;
                T9 = T7 - T8;
                T5 = v3.x;
                Td = v3.y;
            }
            {
                real_t T6, Ta, Te, Tg;
                T6 = T4 + T5;
                Ta = T4 - T5;
                Te = Tc - Td;
                Tg = Tc + Td;
                v0.x = T3 + T6;
                v0.y = Tf + Tg;
                v2.y = Tf - Tg;
                v2.x = T3 - T6;
                v3.y = Ta + T9;
                v3.x = Tb - Te;
                v1.x = Tb + Te;
                v1.y = T9 - Ta;
            }
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
    *u3 = v3;
}
/*
 * Radix-4 decimation-in-time FFT stage (forward transform): twiddle,
 * 4-point DFT, then scatter at the digit-reversed offset j.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 4) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 4;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    double2 v3 = x[3 * threads];
    if(p != 1)
    {
        /* angles are m * (-pi/2) for m = 1..3 */
        v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p));
        v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p));
        v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p));
    }
    dft4(&v0, &v1, &v2, &v3);
    const size_t j = k + (i - k) * 4;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
    y[3 * p] = v3;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
/*
 * 3-point forward DFT butterfly applied in place to *u0..*u2.
 * FFTW-generated code; KP866025403 = sin(pi/3), KP500000000 = 1/2.
 */
DEVICE void
dft3 (real2_t * u0, real2_t * u1, real2_t * u2)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    {
        const real_t KP866025403 =
            +0.866025403784438646763723170752936183471402627;
        const real_t KP500000000 =
            +0.500000000000000000000000000000000000000000000;
        {
            real_t T9, T1, T6, T2, T3, T7;
            T9 = v0.y;
            T1 = v0.x;
            T6 = v1.y;
            T2 = v1.x;
            T3 = v2.x;
            T7 = v2.y;
            {
                real_t T4, Tc, T8, Ta, T5, Tb;
                T4 = T2 + T3;
                Tc = T3 - T2;
                T8 = T6 - T7;
                Ta = T6 + T7;
                T5 = fma (-KP500000000, T4, T1);
                Tb = fma (-KP500000000, Ta, T9);
                v0.y = T9 + Ta;
                v0.x = T1 + T4;
                v2.x = fma (-KP866025403, T8, T5);
                v2.y = fma (-KP866025403, Tc, Tb);
                v1.y = fma (KP866025403, Tc, Tb);
                v1.x = fma (KP866025403, T8, T5);
            }
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
}
/*
 * Radix-3 decimation-in-time FFT stage (forward transform): twiddle,
 * 3-point DFT, then scatter at the digit-reversed offset j.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 3) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 3;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    if(p != 1)
    {
        /* angles are m * (-2*pi/3) for m = 1..2 */
        v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p));
        v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p));
    }
    dft3(&v0, &v1, &v2);
    const size_t j = k + (i - k) * 3;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
/*
 * 5-point forward DFT butterfly applied in place to *u0..*u4.
 * FFTW-generated straight-line code; the KP* constants are the sines and
 * cosines of multiples of 2*pi/5, and the statement order is scheduled
 * for the pipeline — do not reorder.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    real2_t v3 = *u3;
    real2_t v4 = *u4;
    {
        const real_t KP951056516 =
            +0.951056516295153572116439333379382143405698634;
        const real_t KP559016994 =
            +0.559016994374947424102293417182819058860154590;
        const real_t KP250000000 =
            +0.250000000000000000000000000000000000000000000;
        const real_t KP618033988 =
            +0.618033988749894848204586834365638117720309180;
        {
            real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
            {
                real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
                Tl = v0.y;
                T1 = v0.x;
                {
                    real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
                    Tc = v1.y;
                    T2 = v1.x;
                    T3 = v4.x;
                    Td = v4.y;
                    Tf = v2.y;
                    T5 = v2.x;
                    T6 = v3.x;
                    Tg = v3.y;
                    Ts = T2 - T3;
                    T4 = T2 + T3;
                    Tt = T5 - T6;
                    T7 = T5 + T6;
                    T8 = T4 + T7;
                    Ta = T4 - T7;
                    Te = Tc - Td;
                    Tm = Tc + Td;
                    Tn = Tf + Tg;
                    Th = Tf - Tg;
                }
                To = Tm + Tn;
                Tq = Tm - Tn;
                Ti = fma (KP618033988, Th, Te);
                Tk = fma (-KP618033988, Te, Th);
                v0.y = Tl + To;
                v0.x = T1 + T8;
                T9 = fma (-KP250000000, T8, T1);
                Tu = fma (KP618033988, Tt, Ts);
                Tw = fma (-KP618033988, Ts, Tt);
                Tp = fma (-KP250000000, To, Tl);
                Tb = fma (KP559016994, Ta, T9);
                Tj = fma (-KP559016994, Ta, T9);
            }
            Tr = fma (KP559016994, Tq, Tp);
            Tv = fma (-KP559016994, Tq, Tp);
            v2.x = fma (-KP951056516, Tk, Tj);
            v2.y = fma (KP951056516, Tw, Tv);
            v3.y = fma (-KP951056516, Tw, Tv);
            v3.x = fma (KP951056516, Tk, Tj);
            v4.x = fma (-KP951056516, Ti, Tb);
            v4.y = fma (KP951056516, Tu, Tr);
            v1.y = fma (-KP951056516, Tu, Tr);
            v1.x = fma (KP951056516, Ti, Tb);
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
    *u3 = v3;
    *u4 = v4;
}
/*
 * Radix-5 decimation-in-time FFT stage (forward transform): twiddle,
 * 5-point DFT, then scatter at the digit-reversed offset j.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 5) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 5;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    double2 v3 = x[3 * threads];
    double2 v4 = x[4 * threads];
    if(p != 1)
    {
        /* angles are m * (-2*pi/5) for m = 1..4 */
        v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p));
        v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p));
        v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p));
        v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p));
    }
    dft5(&v0, &v1, &v2, &v3, &v4);
    const size_t j = k + (i - k) * 5;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
    y[3 * p] = v3;
    y[4 * p] = v4;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
    return r;
}
/*
 * Element-wise complex multiplication of every row of `data` by the
 * vector `exp` of length `stride` (pointwise product step of the
 * Bluestein convolution).
 */
kernel void bluestein_mul
(
    global const double2 * data,
    global const double2 * exp,
    global double2 * output,
    uint stride
)
{
    const size_t x = get_global_id(0);   /* column inside a row */
    const size_t y = get_global_id(1);   /* row index */
    if(x < stride)
    {
        const size_t off = x + stride * y;
        output[off] = mul(data[off], exp[x]);
    }
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/*
 * Conjugate complex product a * conj(b) for (real, imag) pairs stored in
 * double2 — used by the inverse-transform kernels, where multiplying by
 * the conjugated twiddle factor flips the transform sign.
 */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 4 -name dft4 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 16 FP additions, 0 FP multiplications, | |
* (or, 16 additions, 0 multiplications, 0 fused multiply/add), | |
* 16 stack variables, 0 constants, and 16 memory accesses | |
*/ | |
/*
 * 4-point inverse (sign +1) DFT butterfly applied in place to *u0..*u3.
 * FFTW-generated straight-line code; statement order is scheduled for
 * the pipeline — do not reorder.
 */
DEVICE void
dft4 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    real2_t v3 = *u3;
    {
        {
            real_t Tc, T4, Tb, T3, Tf, Ta, T5, Td;
            {
                real_t T8, T1, T2, T9;
                T8 = v0.y;
                T1 = v0.x;
                T2 = v2.x;
                T9 = v2.y;
                Tc = v1.y;
                T4 = v1.x;
                Tb = T1 - T2;
                T3 = T1 + T2;
                Tf = T8 + T9;
                Ta = T8 - T9;
                T5 = v3.x;
                Td = v3.y;
            }
            {
                real_t T6, T7, Te, Tg;
                T6 = T4 + T5;
                T7 = T4 - T5;
                Te = Tc - Td;
                Tg = Tc + Td;
                v0.x = T3 + T6;
                v0.y = Tf + Tg;
                v2.y = Tf - Tg;
                v2.x = T3 - T6;
                v3.y = Ta - T7;
                v3.x = Tb + Te;
                v1.x = Tb - Te;
                v1.y = T7 + Ta;
            }
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
    *u3 = v3;
}
/*
 * Radix-4 FFT stage for the inverse transform.  The twiddle angles are the
 * same negative multiples of pi/2 as in the forward kernel, but here mul()
 * is the conjugating variant, so the product effectively applies the
 * positive-sign twiddles.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 4) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 4;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    double2 v3 = x[3 * threads];
    if(p != 1)
    {
        v1 = mul(v1, twiddle((double)-1.570796326794897 * k / p));
        v2 = mul(v2, twiddle((double)-3.141592653589793 * k / p));
        v3 = mul(v3, twiddle((double)-4.71238898038469 * k / p));
    }
    dft4(&v0, &v1, &v2, &v3);
    const size_t j = k + (i - k) * 4;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
    y[3 * p] = v3;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Conjugate complex product a * conj(b) (inverse-transform variant). */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 3 -name dft3 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 12 FP additions, 6 FP multiplications, | |
* (or, 6 additions, 0 multiplications, 6 fused multiply/add), | |
* 17 stack variables, 2 constants, and 12 memory accesses | |
*/ | |
/*
 * 3-point inverse (sign +1) DFT butterfly applied in place to *u0..*u2.
 * FFTW-generated code; KP866025403 = sin(pi/3), KP500000000 = 1/2.
 */
DEVICE void
dft3 (real2_t * u0, real2_t * u1, real2_t * u2)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    {
        const real_t KP866025403 =
            +0.866025403784438646763723170752936183471402627;
        const real_t KP500000000 =
            +0.500000000000000000000000000000000000000000000;
        {
            real_t T5, T1, T6, T2, T3, T7;
            T5 = v0.y;
            T1 = v0.x;
            T6 = v1.y;
            T2 = v1.x;
            T3 = v2.x;
            T7 = v2.y;
            {
                real_t T4, Ta, T8, Tc, Tb, T9;
                T4 = T2 + T3;
                Ta = T2 - T3;
                T8 = T6 + T7;
                Tc = T7 - T6;
                Tb = fma (-KP500000000, T4, T1);
                T9 = fma (-KP500000000, T8, T5);
                v0.y = T5 + T8;
                v0.x = T1 + T4;
                v2.x = fma (-KP866025403, Tc, Tb);
                v2.y = fma (-KP866025403, Ta, T9);
                v1.y = fma (KP866025403, Ta, T9);
                v1.x = fma (KP866025403, Tc, Tb);
            }
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
}
/*
 * Radix-3 FFT stage for the inverse transform.  Same negative angles as the
 * forward kernel; the conjugating mul() variant in this program flips the
 * effective twiddle sign.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 3) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 3;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    if(p != 1)
    {
        v1 = mul(v1, twiddle((double)-2.094395102393195 * k / p));
        v2 = mul(v2, twiddle((double)-4.188790204786391 * k / p));
    }
    dft3(&v0, &v1, &v2);
    const size_t j = k + (i - k) * 3;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Conjugate complex product a * conj(b) (inverse-transform variant). */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y};
    return r;
}
/*
 * Returns exp(i*alpha) as (cos(alpha), sin(alpha)); OpenCL sincos()
 * returns the sine and stores the cosine through the pointer argument.
 */
double2 twiddle
(
    double alpha
)
{
    double cs, sn = sincos(alpha, &cs);
    double2 r = {cs, sn};
    return r;
}/* Generated by: ./cl_gen_notw.native -n 5 -name dft5 -sign 1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */
/* | |
* This function contains 32 FP additions, 18 FP multiplications, | |
* (or, 14 additions, 0 multiplications, 18 fused multiply/add), | |
* 41 stack variables, 4 constants, and 20 memory accesses | |
*/ | |
/*
 * 5-point inverse (sign +1) DFT butterfly applied in place to *u0..*u4.
 * Identical to the forward codelet except that the signs on the final
 * KP951056516 stores are flipped.  FFTW-generated; do not reorder.
 */
DEVICE void
dft5 (real2_t * u0, real2_t * u1, real2_t * u2, real2_t * u3, real2_t * u4)
{
    real2_t v0 = *u0;
    real2_t v1 = *u1;
    real2_t v2 = *u2;
    real2_t v3 = *u3;
    real2_t v4 = *u4;
    {
        const real_t KP951056516 =
            +0.951056516295153572116439333379382143405698634;
        const real_t KP559016994 =
            +0.559016994374947424102293417182819058860154590;
        const real_t KP250000000 =
            +0.250000000000000000000000000000000000000000000;
        const real_t KP618033988 =
            +0.618033988749894848204586834365638117720309180;
        {
            real_t Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
            {
                real_t Tl, T1, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
                Tl = v0.y;
                T1 = v0.x;
                {
                    real_t Tc, T2, T3, Td, Tf, T5, T6, Tg, T4, T7;
                    Tc = v1.y;
                    T2 = v1.x;
                    T3 = v4.x;
                    Td = v4.y;
                    Tf = v2.y;
                    T5 = v2.x;
                    T6 = v3.x;
                    Tg = v3.y;
                    Ts = T2 - T3;
                    T4 = T2 + T3;
                    Tt = T5 - T6;
                    T7 = T5 + T6;
                    T8 = T4 + T7;
                    Ta = T4 - T7;
                    Te = Tc - Td;
                    Tm = Tc + Td;
                    Tn = Tf + Tg;
                    Th = Tf - Tg;
                }
                To = Tm + Tn;
                Tq = Tm - Tn;
                Ti = fma (KP618033988, Th, Te);
                Tk = fma (-KP618033988, Te, Th);
                v0.y = Tl + To;
                v0.x = T1 + T8;
                T9 = fma (-KP250000000, T8, T1);
                Tu = fma (KP618033988, Tt, Ts);
                Tw = fma (-KP618033988, Ts, Tt);
                Tp = fma (-KP250000000, To, Tl);
                Tb = fma (KP559016994, Ta, T9);
                Tj = fma (-KP559016994, Ta, T9);
            }
            Tr = fma (KP559016994, Tq, Tp);
            Tv = fma (-KP559016994, Tq, Tp);
            v2.x = fma (KP951056516, Tk, Tj);
            v2.y = fma (-KP951056516, Tw, Tv);
            v3.y = fma (KP951056516, Tw, Tv);
            v3.x = fma (-KP951056516, Tk, Tj);
            v4.x = fma (KP951056516, Ti, Tb);
            v4.y = fma (-KP951056516, Tu, Tr);
            v1.y = fma (KP951056516, Tu, Tr);
            v1.x = fma (-KP951056516, Ti, Tb);
        }
    }
    *u0 = v0;
    *u1 = v1;
    *u2 = v2;
    *u3 = v3;
    *u4 = v4;
}
/*
 * Radix-5 FFT stage for the inverse transform.  Same negative angles as the
 * forward kernel; the conjugating mul() variant in this program flips the
 * effective twiddle sign.
 */
kernel void radix
(
    global const double2 * x,
    global double2 * y,
    uint p,        /* sub-transform size of the current stage */
    uint threads   /* butterflies per batch (signal length / 5) */
)
{
    const size_t i = get_global_id(0);
    if(i >= threads) return;
    const size_t k = i % p;  /* position inside the current sub-transform */
    const size_t batch_offset = get_global_id(1) * threads * 5;
    x += i + batch_offset;
    double2 v0 = x[0 * threads];
    double2 v1 = x[1 * threads];
    double2 v2 = x[2 * threads];
    double2 v3 = x[3 * threads];
    double2 v4 = x[4 * threads];
    if(p != 1)
    {
        v1 = mul(v1, twiddle((double)-1.256637061435917 * k / p));
        v2 = mul(v2, twiddle((double)-2.513274122871834 * k / p));
        v3 = mul(v3, twiddle((double)-3.769911184307752 * k / p));
        v4 = mul(v4, twiddle((double)-5.026548245743669 * k / p));
    }
    dft5(&v0, &v1, &v2, &v3, &v4);
    const size_t j = k + (i - k) * 5;  /* digit-reversed output index */
    y += j + batch_offset;
    y[0 * p] = v0;
    y[1 * p] = v1;
    y[2 * p] = v2;
    y[3 * p] = v3;
    y[4 * p] = v4;
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/* Complex product a * b of two (real, imag) pairs stored in double2. */
double2 mul
(
    double2 a,
    double2 b
)
{
    double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
    return r;
}
/* Scales the complex value x by the real factor a. */
double2 scale
(
    double2 x,
    double a
)
{
    double2 scaled = { a * x.x, a * x.y };
    return scaled;
}
/*
 * Bluestein post-multiplication: scales the convolution result by `div`
 * (the normalization factor), multiplies by the chirp `exp`, and scatters
 * each length-`radix` chunk to its digit-reversed position in the output.
 */
kernel void bluestein_mul_out
(
    global const double2 * data,
    global const double2 * exp,
    global double2 * output,
    double div,      /* normalization factor applied to every element */
    uint p,
    uint in_stride,
    uint radix
)
{
    const size_t i = get_global_id(0);
    const size_t threads = get_global_size(0);
    const size_t b = get_global_id(1);   /* batch index */
    const size_t l = get_global_id(2);   /* element inside the chunk */
    if(l < radix)
    {
        const size_t k = i % p;
        const size_t j = k + (i - k) * radix;  /* digit-reversed offset */
        const size_t in_off = i * in_stride + b * in_stride * threads + l;
        const size_t out_off = j + b * threads * radix + l * p;
        output[out_off] = mul(scale(data[in_off], div), exp[l]);
    }
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
/*
 * Matrix transpose through local memory, tile by tile.
 * NOTE(review): the tile size is hard-coded as 32x32 (block[1024] indexed
 * with local_x + local_y * 32), which assumes the kernel is enqueued with
 * a 32x32 work-group — confirm at the enqueue site.
 */
kernel void transpose
(
    global const double2 * input,
    global double2 * output,
    uint width,
    uint height
)
{
    const size_t global_x = get_global_id(0);
    const size_t global_y = get_global_id(1);
    const size_t local_x = get_local_id(0);
    const size_t local_y = get_local_id(1);
    const size_t group_x = get_group_id(0);
    const size_t group_y = get_group_id(1);
    /* transposed coordinates of this work-item's element */
    const size_t target_x = local_y + group_y * 32;
    const size_t target_y = local_x + group_x * 32;
    const bool range = global_x < width && global_y < height;
    local double2 block[1024];
    if(range) block[local_x + local_y * 32] = input[global_x + global_y * width];
    barrier(CLK_LOCAL_MEM_FENCE);
    if(range) output[target_x + target_y * height] = block[local_x + local_y * 32];
}
#define DEVICE | |
#if defined(cl_khr_fp64) | |
# pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#elif defined(cl_amd_fp64) | |
# pragma OPENCL EXTENSION cl_amd_fp64: enable | |
#endif | |
typedef double real_t; | |
typedef double2 real2_t; | |
double2 mul | |
( | |
double2 a, | |
double2 b | |
) | |
{ | |
double2 r = {a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}; | |
return r; | |
} | |
double2 twiddle | |
( | |
double alpha | |
) | |
{ | |
double cs, sn = sincos(alpha, &cs); | |
double2 r = {cs, sn}; | |
return r; | |
}/* Generated by: ./cl_gen_notw.native -n 2 -name dft2 -sign -1 -compact -fma -reorder-insns -reorder-loads -reorder-stores -schedule-for-pipeline -pipeline-latency 4 -standalone */ | |
/* | |
* This function contains 4 FP additions, 0 FP multiplications, | |
* (or, 4 additions, 0 multiplications, 0 fused multiply/add), | |
* 6 stack variables, 0 constants, and 8 memory accesses | |
*/ | |
DEVICE void | |
dft2 (real2_t * u0, real2_t * u1) | |
{ | |
real2_t v0 = *u0; | |
real2_t v1 = *u1; | |
{ | |
{ | |
real_t T3, T1, T2, T4; | |
T3 = v0.y; | |
T1 = v0.x; | |
T2 = v1.x; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment