kxbmap/HelloJOCL.scala

## HelloJOCL.scala
package com.example.hellojocl

import java.nio.FloatBuffer
import util.Random
import com.jogamp.opencl.{CLBuffer, CLContext}


// Vector-addition demo driven through JOCL: every OpenCL object is created lazily
// from the supplied context the first time it is needed.
class HelloJOCL(ctx: CLContext) {
  import HelloJOCL._
  import com.jogamp.opencl.CLMemory.Mem.{READ_ONLY, WRITE_ONLY}

  // Device the context reports through getMaxFlopsDevice; announced once on first use.
  lazy val device = {
    val chosen = ctx.getMaxFlopsDevice
    println("using " + chosen)
    chosen
  }

  // Command queue bound to the chosen device.
  lazy val queue = device.createCommandQueue()

  // Number of vector elements the kernel has to add.
  val elementCount = 1444477

  // Work-group size: whatever the device allows, capped at 256.
  lazy val localWorkSize = math.min(256, device.getMaxWorkGroupSize)

  // Global range: elementCount padded up to a whole number of work groups.
  lazy val globalWorkSize = roundUp(localWorkSize, elementCount)

  // Program built from the VectorAdd.cl resource that sits next to this class.
  lazy val program = {
    val source = classOf[HelloJOCL].getResourceAsStream("VectorAdd.cl")
    ctx.createProgram(source).build()
  }

  // Read-only input buffer of globalWorkSize floats, filled from a fixed seed
  // (fixed seeds keep the test data identical between runs).
  private def inputBuffer(seed: Int) =
    fillBuffer(ctx.createFloatBuffer(globalWorkSize, READ_ONLY), seed)

  // A and B are the inputs, C receives the result.
  lazy val clBufferA = inputBuffer(12345)
  lazy val clBufferB = inputBuffer(67890)
  lazy val clBufferC = ctx.createFloatBuffer(globalWorkSize, WRITE_ONLY)

  // The 'VectorAdd' kernel with the three buffers and the element count bound
  // as its arguments. Prints the memory footprint and work sizes when created.
  lazy val kernel = {
    val totalBytes = clBufferA.getCLSize + clBufferB.getCLSize + clBufferC.getCLSize
    val usedMiB = totalBytes / 1024 / 1024
    println("used device memory: " + usedMiB + "MiB")

    println("localWorkSize: " + localWorkSize + ", globalWorkSize: " + globalWorkSize)

    val vectorAdd = program.createCLKernel("VectorAdd")
    vectorAdd.putArgs(clBufferA, clBufferB, clBufferC)
    vectorAdd.putArg(elementCount)
  }

  // Runs one addition pass and returns the host-side result buffer, rewound to
  // its start. The same buffer instance is handed back on every call.
  def run() = {
    // Both input uploads are enqueued without blocking ...
    val uploads = queue
      .putWriteBuffer(clBufferA, false)
      .putWriteBuffer(clBufferB, false)

    // ... then the kernel is enqueued and the result is read back with a blocking read.
    uploads
      .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
      .putReadBuffer(clBufferC, true)

    val result = clBufferC.getBuffer
    result.rewind()
    result
  }

}

// Entry point and helpers for the JOCL vector-addition demo.
object HelloJOCL {

  // Creates an OpenCL context, runs the addition ten times and prints the
  // wall-clock time of every run. The first run additionally prints the first
  // ten result values. The context is released even if a run throws.
  def main(args: Array[String]): Unit = {
    val ctx = CLContext.create()
    println("created " + ctx)

    try {
      val hello = new HelloJOCL(ctx)

      for (i <- 1 to 10) {
        val startTime = System.nanoTime
        val ret = hello.run()
        val endTime = System.nanoTime

        if (i == 1) {
          // print first few elements of the resulting buffer to the console.
          println("a+b=c results snapshot: ")
          // the counter itself is unused: each get() advances the buffer position
          for (_ <- 0 until 10) {
            print(ret.get().toString + ", ")
          }
          println("...; " + ret.remaining + " more")
        }
        println("computation took %2d: %d micro sec".format(i, (endTime - startTime) / 1000))
      }
    } finally ctx.release()
  }

  // Fills the host-side buffer of `clBuf` (from its current position to its
  // limit) with pseudo-random floats in [0, 100) and rewinds it.
  //
  // A private generator seeded with `seed` is used, so the data is reproducible
  // and the process-wide scala.util.Random singleton is left untouched.
  // Returns `clBuf` itself to allow call chaining.
  def fillBuffer(clBuf: CLBuffer[FloatBuffer], seed: Int): CLBuffer[FloatBuffer] = {
    val rng = new Random(seed)
    val buffer = clBuf.getBuffer
    buffer.put(Array.fill(buffer.remaining)(rng.nextFloat() * 100)).rewind()
    clBuf
  }

  // Rounds `globalSize` up to the nearest multiple of `groupSize`.
  // Throws IllegalArgumentException when `groupSize` is not positive.
  def roundUp(groupSize: Int, globalSize: Int): Int = {
    require(groupSize > 0, "groupSize must be positive, got " + groupSize)
    val r = globalSize % groupSize
    if (r == 0) globalSize
    else globalSize + groupSize - r
  }

}

## HelloScala.scala
package com.example.hellojocl

import util.Random


// Plain-Scala counterpart of the OpenCL demo: adds two float arrays element by
// element and reports how long each of ten passes took.
class HelloScala {
  // Number of elements in each array.
  val size = 1444477

  // Inputs, generated from fixed seeds so the data is identical between runs.
  lazy val arrayA: Array[Float] = seededFloats(12345)

  lazy val arrayB: Array[Float] = seededFloats(67890)

  // Result array, initially all zeros.
  lazy val arrayC: Array[Float] = Array.ofDim[Float](size)

  // `size` pseudo-random floats in [0, 100) from a private generator, so the
  // process-wide scala.util.Random singleton is not reseeded.
  private def seededFloats(seed: Int): Array[Float] = {
    val rng = new Random(seed)
    Array.fill(size)(rng.nextFloat() * 100)
  }

  // Runs ten timed addition passes over the indices produced by `r(size)`,
  // then prints the first ten results followed by the ten timings.
  //
  // r: maps the element count to the index sequence to iterate over; every
  //    index it yields must be a valid index into the arrays.
  def run(r: Int => Seq[Int]): Unit = {
    val seq = r(size)

    // Materialise the arrays before the clock starts: otherwise the first pass
    // would also time the random fill, and every element access would go
    // through the lazy-val accessor.
    val a = arrayA
    val b = arrayB
    val c = arrayC

    val t = for (_ <- 1 to 10) yield {
      val startTime = System.nanoTime
      for (i <- seq) {
        c(i) = a(i) + b(i)
      }
      val endTime = System.nanoTime

      // collect garbage between passes, outside the timed region
      System.gc()

      (endTime - startTime) / 1000
    }

    println("a+b=c results snapshot: ")
    for (i <- 0 until 10) {
      print(c(i).toString + ", ")
    }
    println("...; " + (c.length - 10) + " more")

    t.foreach(micros => println("computation took: " + micros + " micro sec"))
  }
}

// Entry point: runs the array addition once over a parallel index range and
// once over a sequential one, each on a fresh HelloScala instance.
object HelloScala {
  def main(args: Array[String]): Unit = {
    println("Parallel - availableProcessors: " + scala.collection.parallel.availableProcessors)
    // NOTE(review): relies on the parallel range conforming to Seq[Int], which
    // depends on the Scala version in use - confirm against the build.
    new HelloScala().run(n => (0 until n).par)
    println("Linear")
    new HelloScala().run(n => 0 until n)
  }
}

## VectorAdd.cl
// OpenCL kernel: element-wise vector addition, c[i] = a[i] + b[i].
//   a, b        - input vectors
//   c           - output vector
//   numElements - number of valid elements; work-items at or beyond it do nothing
kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {

    // position of this work-item in dimension 0 of the global range
    int gid = get_global_id(0);

    // only work-items that map onto real data write a result
    if (gid < numElements) {
        c[gid] = a[gid] + b[gid];
    }
}
	package com.example.hellojocl

	import java.nio.FloatBuffer
	import util.Random
	import com.jogamp.opencl.{CLBuffer, CLContext}


	// Vector-addition demo driven through JOCL: every OpenCL object is created lazily
	// from the supplied context the first time it is needed.
	class HelloJOCL(ctx: CLContext) {
	  import HelloJOCL._
	  import com.jogamp.opencl.CLMemory.Mem.{READ_ONLY, WRITE_ONLY}

	  // Device the context reports through getMaxFlopsDevice; announced once on first use.
	  lazy val device = {
	    val chosen = ctx.getMaxFlopsDevice
	    println("using " + chosen)
	    chosen
	  }

	  // Command queue bound to the chosen device.
	  lazy val queue = device.createCommandQueue()

	  // Number of vector elements the kernel has to add.
	  val elementCount = 1444477

	  // Work-group size: whatever the device allows, capped at 256.
	  lazy val localWorkSize = math.min(256, device.getMaxWorkGroupSize)

	  // Global range: elementCount padded up to a whole number of work groups.
	  lazy val globalWorkSize = roundUp(localWorkSize, elementCount)

	  // Program built from the VectorAdd.cl resource that sits next to this class.
	  lazy val program = {
	    val source = classOf[HelloJOCL].getResourceAsStream("VectorAdd.cl")
	    ctx.createProgram(source).build()
	  }

	  // Read-only input buffer of globalWorkSize floats, filled from a fixed seed
	  // (fixed seeds keep the test data identical between runs).
	  private def inputBuffer(seed: Int) =
	    fillBuffer(ctx.createFloatBuffer(globalWorkSize, READ_ONLY), seed)

	  // A and B are the inputs, C receives the result.
	  lazy val clBufferA = inputBuffer(12345)
	  lazy val clBufferB = inputBuffer(67890)
	  lazy val clBufferC = ctx.createFloatBuffer(globalWorkSize, WRITE_ONLY)

	  // The 'VectorAdd' kernel with the three buffers and the element count bound
	  // as its arguments. Prints the memory footprint and work sizes when created.
	  lazy val kernel = {
	    val totalBytes = clBufferA.getCLSize + clBufferB.getCLSize + clBufferC.getCLSize
	    val usedMiB = totalBytes / 1024 / 1024
	    println("used device memory: " + usedMiB + "MiB")

	    println("localWorkSize: " + localWorkSize + ", globalWorkSize: " + globalWorkSize)

	    val vectorAdd = program.createCLKernel("VectorAdd")
	    vectorAdd.putArgs(clBufferA, clBufferB, clBufferC)
	    vectorAdd.putArg(elementCount)
	  }

	  // Runs one addition pass and returns the host-side result buffer, rewound to
	  // its start. The same buffer instance is handed back on every call.
	  def run() = {
	    // Both input uploads are enqueued without blocking ...
	    val uploads = queue
	      .putWriteBuffer(clBufferA, false)
	      .putWriteBuffer(clBufferB, false)

	    // ... then the kernel is enqueued and the result is read back with a blocking read.
	    uploads
	      .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
	      .putReadBuffer(clBufferC, true)

	    val result = clBufferC.getBuffer
	    result.rewind()
	    result
	  }

	}

	// Entry point and helpers for the JOCL vector-addition demo.
	object HelloJOCL {

	  // Creates an OpenCL context, runs the addition ten times and prints the
	  // wall-clock time of every run. The first run additionally prints the first
	  // ten result values. The context is released even if a run throws.
	  def main(args: Array[String]): Unit = {
	    val ctx = CLContext.create()
	    println("created " + ctx)

	    try {
	      val hello = new HelloJOCL(ctx)

	      for (i <- 1 to 10) {
	        val startTime = System.nanoTime
	        val ret = hello.run()
	        val endTime = System.nanoTime

	        if (i == 1) {
	          // print first few elements of the resulting buffer to the console.
	          println("a+b=c results snapshot: ")
	          // the counter itself is unused: each get() advances the buffer position
	          for (_ <- 0 until 10) {
	            print(ret.get().toString + ", ")
	          }
	          println("...; " + ret.remaining + " more")
	        }
	        println("computation took %2d: %d micro sec".format(i, (endTime - startTime) / 1000))
	      }
	    } finally ctx.release()
	  }

	  // Fills the host-side buffer of `clBuf` (from its current position to its
	  // limit) with pseudo-random floats in [0, 100) and rewinds it.
	  //
	  // A private generator seeded with `seed` is used, so the data is reproducible
	  // and the process-wide scala.util.Random singleton is left untouched.
	  // Returns `clBuf` itself to allow call chaining.
	  def fillBuffer(clBuf: CLBuffer[FloatBuffer], seed: Int): CLBuffer[FloatBuffer] = {
	    val rng = new Random(seed)
	    val buffer = clBuf.getBuffer
	    buffer.put(Array.fill(buffer.remaining)(rng.nextFloat() * 100)).rewind()
	    clBuf
	  }

	  // Rounds `globalSize` up to the nearest multiple of `groupSize`.
	  // Throws IllegalArgumentException when `groupSize` is not positive.
	  def roundUp(groupSize: Int, globalSize: Int): Int = {
	    require(groupSize > 0, "groupSize must be positive, got " + groupSize)
	    val r = globalSize % groupSize
	    if (r == 0) globalSize
	    else globalSize + groupSize - r
	  }

	}
	package com.example.hellojocl

	import util.Random


	// Plain-Scala counterpart of the OpenCL demo: adds two float arrays element by
	// element and reports how long each of ten passes took.
	class HelloScala {
	  // Number of elements in each array.
	  val size = 1444477

	  // Inputs, generated from fixed seeds so the data is identical between runs.
	  lazy val arrayA: Array[Float] = seededFloats(12345)

	  lazy val arrayB: Array[Float] = seededFloats(67890)

	  // Result array, initially all zeros.
	  lazy val arrayC: Array[Float] = Array.ofDim[Float](size)

	  // `size` pseudo-random floats in [0, 100) from a private generator, so the
	  // process-wide scala.util.Random singleton is not reseeded.
	  private def seededFloats(seed: Int): Array[Float] = {
	    val rng = new Random(seed)
	    Array.fill(size)(rng.nextFloat() * 100)
	  }

	  // Runs ten timed addition passes over the indices produced by `r(size)`,
	  // then prints the first ten results followed by the ten timings.
	  //
	  // r: maps the element count to the index sequence to iterate over; every
	  //    index it yields must be a valid index into the arrays.
	  def run(r: Int => Seq[Int]): Unit = {
	    val seq = r(size)

	    // Materialise the arrays before the clock starts: otherwise the first pass
	    // would also time the random fill, and every element access would go
	    // through the lazy-val accessor.
	    val a = arrayA
	    val b = arrayB
	    val c = arrayC

	    val t = for (_ <- 1 to 10) yield {
	      val startTime = System.nanoTime
	      for (i <- seq) {
	        c(i) = a(i) + b(i)
	      }
	      val endTime = System.nanoTime

	      // collect garbage between passes, outside the timed region
	      System.gc()

	      (endTime - startTime) / 1000
	    }

	    println("a+b=c results snapshot: ")
	    for (i <- 0 until 10) {
	      print(c(i).toString + ", ")
	    }
	    println("...; " + (c.length - 10) + " more")

	    t.foreach(micros => println("computation took: " + micros + " micro sec"))
	  }
	}

	// Entry point: runs the array addition once over a parallel index range and
	// once over a sequential one, each on a fresh HelloScala instance.
	object HelloScala {
	  def main(args: Array[String]): Unit = {
	    println("Parallel - availableProcessors: " + scala.collection.parallel.availableProcessors)
	    // NOTE(review): relies on the parallel range conforming to Seq[Int], which
	    // depends on the Scala version in use - confirm against the build.
	    new HelloScala().run(n => (0 until n).par)
	    println("Linear")
	    new HelloScala().run(n => 0 until n)
	  }
	}
	// OpenCL kernel: element-wise vector addition, c[i] = a[i] + b[i].
	//   a, b        - input vectors
	//   c           - output vector
	//   numElements - number of valid elements; work-items at or beyond it do nothing
	kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {

	    // position of this work-item in dimension 0 of the global range
	    int gid = get_global_id(0);

	    // only work-items that map onto real data write a result
	    if (gid < numElements) {
	        c[gid] = a[gid] + b[gid];
	    }
	}