adituv/dotprod.c

## dotprod.c
//OpenCL C source for the kernel that runs on the graphics card, embedded in the
//program as a plaintext string. (It could also be shipped as a prebuilt binary;
//that is not done here.)
//
//Kernel arguments:
//  source - input floats
//  dest   - output floats
//  n      - element count
//
//idx and iter are uint so that they match n in the comparison and in min().
//
//NOTE(review): min(iter*idx,n) can evaluate to n, which is one past the last
//element if the buffers hold n floats, and (iter-1)*idx is not bounds-checked.
//The loop also reads only from source, never from values already written to
//dest. Confirm the intended indexing before relying on the result.
const char* source=
"__kernel                                                                    \n"
"void sum(__global float* source, __global float* dest, uint n) {            \n"
"  uint idx = get_global_id(0);                                              \n"
"  uint iter = 2;                                                            \n"
"                                                                            \n"
"  while(iter < n && idx % iter == 0) {                                      \n"
"    dest[min(iter*idx,n)] = source[min(iter*idx,n)] + source[(iter-1)*idx]; \n"
"    iter *= 2;                                                              \n"
"  }                                                                         \n"
"}                                                                           \n"
;

//Find the sum of the first n elements of input
//Input is a buffer containing n floats on the graphics card
float sum(input, n) {

  //Allocate memory on the device to store the result
  output = createbuffer(sizeof (float), n)

  program = createGPUProgram(&source)
  buildGPUProgram(program)

  //How parallel and in how many dimensions the code is running
  //Here, the code is running in one dimension of size the nearest
  //power of 2 greater than n
  dimensions = 2^(ceil(log(n)))

  //The kernel is just the function executing on the graphics card
  kernel = createKernel(source, 1, dimensions, "sum")
  setKernelArgument(kernel, 0, input)
  setKernelArgument(kernel, 1, output)
  setKernelArgument(kernel, 2, n)

  runKernel(kernel)

  waitForKernel()

  //Read back graphics card memory to main memory
  result = getbuffercontents(output)

  //The tree of partial sums has the root node at the last index
  //return this result for the full sum
  return result[n-1]

}
	//OpenCL C source for the kernel that runs on the graphics card, embedded in the
	//program as a plaintext string. (It could also be shipped as a prebuilt binary;
	//that is not done here.)
	//
	//Kernel arguments:
	//  source - input floats
	//  dest   - output floats
	//  n      - element count
	//
	//idx and iter are uint so that they match n in the comparison and in min().
	//
	//NOTE(review): min(iter*idx,n) can evaluate to n, which is one past the last
	//element if the buffers hold n floats, and (iter-1)*idx is not bounds-checked.
	//The loop also reads only from source, never from values already written to
	//dest. Confirm the intended indexing before relying on the result.
	const char* source=
	"__kernel \n"
	"void sum(__global float* source, __global float* dest, uint n) { \n"
	"  uint idx = get_global_id(0); \n"
	"  uint iter = 2; \n"
	" \n"
	"  while(iter < n && idx % iter == 0) { \n"
	"    dest[min(iter*idx,n)] = source[min(iter*idx,n)] + source[(iter-1)*idx]; \n"
	"    iter *= 2; \n"
	"  } \n"
	"} \n"
	;

	//Find the sum of the first n elements of input
	//Input is a buffer containing n floats on the graphics card
	float sum(input, n) {

	//Allocate memory on the device to store the result
	output = createbuffer(sizeof (float), n)

	program = createGPUProgram(&source)
	buildGPUProgram(program)

	//How parallel and in how many dimensions the code is running
	//Here, the code is running in one dimension of size the nearest
	//power of 2 greater than n
	dimensions = 2^(ceil(log(n)))

	//The kernel is just the function executing on the graphics card
	kernel = createKernel(source, 1, dimensions, "sum")
	setKernelArgument(kernel, 0, input)
	setKernelArgument(kernel, 1, output)
	setKernelArgument(kernel, 2, n)

	runKernel(kernel)

	waitForKernel()

	//Read back graphics card memory to main memory
	result = getbuffercontents(output)

	//The tree of partial sums has the root node at the last index
	//return this result for the full sum
	return result[n-1]

	}