Skip to content

Instantly share code, notes, and snippets.

@ddemidov
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ddemidov/8ed870ffa0cb409ae6a1 to your computer and use it in GitHub Desktop.
Save ddemidov/8ed870ffa0cb409ae6a1 to your computer and use it in GitHub Desktop.
pi: pi.cpp
g++ -o pi pi.cpp -std=c++11 -I${VEXCL_ROOT} -lOpenCL -lboost_system
$ ./pi
1. Capeverde (AMD Accelerated Parallel Processing)
pi: 3.1414
$ export VEXCL_SHOW_KERNELS=1
$ ./pi
1. Capeverde (AMD Accelerated Parallel Processing)
#if defined(cl_khr_fp64)
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#elif defined(cl_amd_fp64)
# pragma OPENCL EXTENSION cl_amd_fp64: enable
#endif
ulong SUM_ulong
(
ulong prm1,
ulong prm2
)
{
return prm1 + prm2;
}
void threefry_uint_4_20
(
uint * ctr,
uint * key
)
{
const uint p = 0x1BD11BDA ^ key[0] ^ key[1] ^ key[2] ^ key[3];
ctr[0] += key[0];
ctr[1] += key[1];
ctr[2] += key[2];
ctr[3] += key[3];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 10u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 26u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 21u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 11u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 27u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 5u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 23u); ctr[3] ^= ctr[2];
ctr[0] += key[1];
ctr[1] += key[2];
ctr[2] += key[3];
ctr[3] += p; ctr[3] += 1;
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 20u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 11u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 17u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 25u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 10u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 20u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 18u); ctr[3] ^= ctr[2];
ctr[0] += key[2];
ctr[1] += key[3];
ctr[2] += p;
ctr[3] += key[0]; ctr[3] += 2;
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 10u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 26u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 21u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 11u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 27u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 5u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 23u); ctr[3] ^= ctr[2];
ctr[0] += key[3];
ctr[1] += p;
ctr[2] += key[0];
ctr[3] += key[1]; ctr[3] += 3;
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 6u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 20u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 11u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 17u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 25u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 10u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 20u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 18u); ctr[3] ^= ctr[2];
ctr[0] += p;
ctr[1] += key[0];
ctr[2] += key[1];
ctr[3] += key[2]; ctr[3] += 4;
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 10u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 26u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 21u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 11u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 13u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 27u); ctr[3] ^= ctr[2];
ctr[0] += ctr[1]; ctr[1] = rotate(ctr[1], 5u); ctr[1] ^= ctr[0];
ctr[2] += ctr[3]; ctr[3] = rotate(ctr[3], 23u); ctr[3] ^= ctr[2];
ctr[0] += key[0];
ctr[1] += key[1];
ctr[2] += key[2];
ctr[3] += key[3]; ctr[3] += 5;
}
double2 random_double2_threefry
(
ulong prm1,
ulong prm2
)
{
union
{
uint ctr[4];
ulong res_i[2];
double res_f[2];
double2 res;
} ctr;
uint key[4];
ctr.ctr[0] = prm1; ctr.ctr[1] = prm2;
ctr.ctr[2] = prm1; ctr.ctr[3] = prm2;
key[0] = 0x12345678;
key[1] = 0x12345678;
key[2] = 0x12345678;
key[3] = 0x12345678;
threefry_uint_4_20(ctr.ctr, key);
ctr.res_f[0] = ctr.res_i[0] / 18446744073709551615.0;
ctr.res_f[1] = ctr.res_i[1] / 18446744073709551615.0;
return ctr.res;
}
kernel void vexcl_reductor_kernel
(
ulong n,
ulong prm_1,
ulong prm_2,
double prm_3,
global ulong * g_odata,
local ulong * smem
)
{
ulong mySum = (ulong)0;
for(ulong idx = get_global_id(0); idx < n; idx += get_global_size(0))
{
mySum = SUM_ulong(mySum, ( length( random_double2_threefry( (prm_1 + idx), prm_2 ) ) < prm_3 ));
}
local ulong * sdata = smem;
size_t tid = get_local_id(0);
size_t block_size = get_local_size(0);
sdata[tid] = mySum;
barrier(CLK_LOCAL_MEM_FENCE);
if (block_size >= 1024)
{
if (tid < 512) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 512]); }
barrier(CLK_LOCAL_MEM_FENCE);
}
if (block_size >= 512)
{
if (tid < 256) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 256]); }
barrier(CLK_LOCAL_MEM_FENCE);
}
if (block_size >= 256)
{
if (tid < 128) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 128]); }
barrier(CLK_LOCAL_MEM_FENCE);
}
if (block_size >= 128)
{
if (tid < 64) { sdata[tid] = mySum = SUM_ulong(mySum, sdata[tid + 64]); }
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid < 32)
{
volatile local ulong * smem = sdata;
if (block_size >= 64) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 32]); }
if (block_size >= 32) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 16]); }
if (block_size >= 16) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 8]); }
if (block_size >= 8) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 4]); }
if (block_size >= 4) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 2]); }
if (block_size >= 2) { smem[tid] = mySum = SUM_ulong(mySum, smem[tid + 1]); }
}
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
pi: 3.14303
#include <iostream>
#include <random>
#include <vexcl/vexcl.hpp>
int main(int argc, char *argv[]) {
const size_t n = argc > 1 ? std::stoi(argv[1]) : 1024 * 1024;
const size_t seed = std::random_device()();
vex::Context ctx(vex::Filter::Env);
std::cout << ctx << std::endl;
// Create a random generator
vex::Random<
cl_double2, // value type to generate
vex::random::threefry // generator type
> rnd;
// Estimate pi with Monte-Carlo method:
vex::Reductor<size_t, vex::SUM> sum(ctx);
double pi = 4.0 * sum( length( rnd(vex::element_index(0, n), seed) ) < 1.0 ) / n;
std::cout << "pi: " << pi << std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment