Skip to content

Instantly share code, notes, and snippets.

@kxbmap
Created March 28, 2011 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kxbmap/891043 to your computer and use it in GitHub Desktop.
Save kxbmap/891043 to your computer and use it in GitHub Desktop.
package com.example.hellojocl
import java.nio.FloatBuffer
import util.Random
import com.jogamp.opencl.{CLBuffer, CLContext}
/**
 * Vector-addition demo on top of JOCL: adds two float buffers element by element
 * on an OpenCL device and hands back the result buffer.
 *
 * Apart from `elementCount`, every member is a `lazy val`, so no OpenCL resource is
 * created before it is first used.
 *
 * @param ctx OpenCL context that supplies the device, the program and the buffers
 */
class HelloJOCL(ctx: CLContext) {

  import HelloJOCL.{fillBuffer, roundUp}
  import com.jogamp.opencl.CLMemory.Mem

  /** Device returned by `ctx.getMaxFlopsDevice`; printed to the console when first resolved. */
  lazy val device = {
    val chosen = ctx.getMaxFlopsDevice
    println("using " + chosen)
    chosen
  }

  /** Command queue created on [[device]]. */
  lazy val queue = device.createCommandQueue()

  /** Length of the arrays to process. */
  val elementCount = 1444477

  /** Local work size: the device's maximum work-group size, capped at 256. */
  lazy val localWorkSize = device.getMaxWorkGroupSize min 256

  /** Global work size: `elementCount` rounded up to the next multiple of [[localWorkSize]]. */
  lazy val globalWorkSize = roundUp(localWorkSize, elementCount)

  /** Program created from the `VectorAdd.cl` class-path resource and built. */
  lazy val program = {
    val source = classOf[HelloJOCL].getResourceAsStream("VectorAdd.cl")
    ctx.createProgram(source).build()
  }

  // A and B are the inputs, C receives the result. The inputs are filled with
  // pseudo-random test data; the seeds are fixed, so the data is the same on every run.

  /** Read-only input buffer A (seed 12345). */
  lazy val clBufferA = fillBuffer(ctx.createFloatBuffer(globalWorkSize, Mem.READ_ONLY), 12345)

  /** Read-only input buffer B (seed 67890). */
  lazy val clBufferB = fillBuffer(ctx.createFloatBuffer(globalWorkSize, Mem.READ_ONLY), 67890)

  /** Write-only output buffer C. */
  lazy val clBufferC = ctx.createFloatBuffer(globalWorkSize, Mem.WRITE_ONLY)

  /**
   * The `VectorAdd` kernel with the three buffers and `elementCount` bound as its arguments.
   * Resolving it also prints the total buffer size and the work sizes.
   */
  lazy val kernel = {
    val totalBytes = clBufferA.getCLSize + clBufferB.getCLSize + clBufferC.getCLSize
    println("used device memory: " + totalBytes / 1024 / 1024 + "MiB")
    println("localWorkSize: " + localWorkSize + ", globalWorkSize: " + globalWorkSize)
    val vectorAdd = program.createCLKernel("VectorAdd")
    vectorAdd.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount)
  }

  /**
   * Enqueues the two input writes (asynchronous), the kernel launch and a blocking
   * read of the output buffer.
   *
   * @return the host-side buffer of C, rewound so that reading starts at the first element
   */
  def run(): FloatBuffer = {
    val nonBlocking = false
    val blocking = true
    queue
      .putWriteBuffer(clBufferA, nonBlocking)
      .putWriteBuffer(clBufferB, nonBlocking)
      .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
      .putReadBuffer(clBufferC, blocking)
    val result = clBufferC.getBuffer
    result.rewind()
    result
  }
}
/** Entry point and helper functions of the JOCL vector-addition demo. */
object HelloJOCL {

  /**
   * Creates an OpenCL context, runs the vector addition ten times and prints the
   * wall-clock duration of each run. The first run additionally prints the first
   * ten result values. The context is released even if a run throws.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val ctx = CLContext.create()
    println("created " + ctx)
    try {
      val hello = new HelloJOCL(ctx)
      for (i <- 1 to 10) {
        // The members of HelloJOCL are lazy, so the first measurement also covers
        // their one-time initialisation.
        val startTime = System.nanoTime
        val ret = hello.run()
        val endTime = System.nanoTime
        if (i == 1) {
          // print first few elements of the resulting buffer to the console.
          println("a+b=c results snapshot: ")
          for (_ <- 0 until 10) {
            print(ret.get().toString + ", ")
          }
          println("...; " + ret.remaining + " more")
        }
        println("computation took %2d: %d micro sec".format(i, (endTime - startTime) / 1000))
      }
    } finally ctx.release()
  }

  /**
   * Fills the host-side buffer of `clBuf` with pseudo-random floats (`nextFloat * 100`)
   * and rewinds it.
   *
   * A generator private to this call is used, so the shared `scala.util.Random`
   * singleton is not reseeded as a side effect; the generated values for a given
   * seed are the same as when reseeding the singleton.
   *
   * @param clBuf buffer whose remaining elements are overwritten
   * @param seed  seed of the generator; equal seeds give equal contents
   * @return `clBuf` itself, to allow call chaining
   */
  def fillBuffer(clBuf: CLBuffer[FloatBuffer], seed: Int): CLBuffer[FloatBuffer] = {
    val rng = new Random(seed)
    val buffer = clBuf.getBuffer
    buffer.put(Array.fill(buffer.remaining)(rng.nextFloat() * 100)).rewind()
    clBuf
  }

  /**
   * Rounds `globalSize` up to the nearest multiple of `groupSize`.
   *
   * @param groupSize  work-group size; must be positive
   * @param globalSize value to round up
   * @return the smallest multiple of `groupSize` that is not less than `globalSize`
   *         (for non-negative `globalSize`)
   * @throws IllegalArgumentException if `groupSize` is not positive
   */
  def roundUp(groupSize: Int, globalSize: Int): Int = {
    require(groupSize > 0, "groupSize must be positive, but was " + groupSize)
    val r = globalSize % groupSize
    if (r == 0) globalSize
    else globalSize + groupSize - r
  }
}
package com.example.hellojocl
import util.Random
/**
 * Plain-Scala version of the vector addition: computes
 * `arrayC(i) = arrayA(i) + arrayB(i)` on the JVM and reports how long each pass takes.
 */
class HelloScala {

  /** Number of elements in each array. */
  val size: Int = 1444477

  /** First input array: `size` pseudo-random floats generated from seed 12345. */
  lazy val arrayA: Array[Float] = randomArray(12345)

  /** Second input array: `size` pseudo-random floats generated from seed 67890. */
  lazy val arrayB: Array[Float] = randomArray(67890)

  /** Result array, initially all zeros. */
  lazy val arrayC: Array[Float] = Array.ofDim[Float](size)

  /**
   * Builds an array of `size` values `nextFloat * 100` from a generator private to
   * this call, so the shared `scala.util.Random` singleton is left untouched.
   */
  private def randomArray(seed: Int): Array[Float] = {
    val rng = new Random(seed)
    Array.fill(size)(rng.nextFloat() * 100)
  }

  /**
   * Performs the addition ten times over the indices produced by `r` and prints
   * the first ten results followed by the duration of every pass in microseconds.
   *
   * @param r maps the element count (`size`) to the sequence of indices to process
   */
  def run(r: Int => Seq[Int]): Unit = {
    val seq = r(size)
    val t = (1 to 10).map { _ =>
      val startTime = System.nanoTime
      for (i <- seq) {
        arrayC(i) = arrayA(i) + arrayB(i)
      }
      val endTime = System.nanoTime
      // requested outside the timed region, between two passes
      System.gc()
      (endTime - startTime) / 1000
    }
    println("a+b=c results snapshot: ")
    for (i <- 0 until 10) {
      print(arrayC(i).toString + ", ")
    }
    println("...; " + (arrayC.length - 10) + " more")
    t.map("computation took: " + _ + " micro sec").foreach(println)
  }
}
/** Entry point that times the plain-Scala vector addition over a parallel and a sequential range. */
object HelloScala {

  /**
   * Runs the benchmark twice, each time on a fresh [[HelloScala]] instance:
   * first over a parallel range, then over an ordinary range.
   *
   * NOTE(review): the first call relies on a parallel range being accepted where
   * `run` expects a `Seq[Int]` - confirm this holds for the Scala version in use.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    println("Parallel - availableProcessors: " + scala.collection.parallel.availableProcessors)
    new HelloScala().run(n => (0 until n).par)
    println("Linear")
    new HelloScala().run(n => 0 until n)
  }
}
// OpenCL kernel: element-by-element vector addition, c[i] = a[i] + b[i].
// Each work-item handles the element at its own global id; work-items whose id
// is numElements or larger leave the output untouched.
__kernel void VectorAdd(__global const float* a,
                        __global const float* b,
                        __global float* c,
                        int numElements) {
    // index of this work-item in dimension 0 of the global range
    const int gid = get_global_id(0);

    // bound check: only in-range work-items store a result
    if (gid < numElements) {
        c[gid] = a[gid] + b[gid];
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment