Created
March 2, 2013 03:49
-
-
Save soloman817/5069624 to your computer and use it in GitHub Desktop.
segmented scan tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open NUnit.Framework | |
open Alea.Interop.LLVM | |
open Alea.CUDA | |
open Alea.CUDA.Extension | |
// define an alias | |
module IRB = Alea.CUDA.IRBuilderUtil | |
module DF = Alea.CUDA.DeviceFunction | |
// get the worker | |
let worker = getDefaultWorker() | |
[<IRB.LLVMFunctionBuilder>] | |
let bfi(x:int, y:int, bit:int, numBits:int):int = failwith "Device Only!" | |
let ``bfi [BUILDER]``(ctx:IRB.LLVMFunctionBuilderContext) = | |
let args = ctx.LLVMValueArgs // arguments LLVM values | |
let i32t = ctx.LLVMHelper.i32_t // int LLVM type | |
let rett = i32t // return type | |
let argst = [| i32t; i32t; i32t; i32t |] // argument type list | |
let funct = LLVMFunctionTypeEx(rett, argst, 0) | |
let funcp = LLVMConstInlineAsm(funct, "bfi.b32 \t$0, $2, $1, $3, $4;", "=r,r,r,r,r", 0, 0) | |
IRB.Value(LLVMBuildCallEx(ctx.Builder, funcp, args, "")) | |
[<ReflectedDefinition>] | |
let segscanWarp (inputs:DevicePtr<int>) (outputs:DevicePtr<int>) (distances:DevicePtr<int>) = | |
let tid = threadIdx.x | |
let packed = inputs.[tid] | |
// the start flag is in the high bit | |
let flag = 0x80000000 &&& packed | |
// get the start flags for each thread in the warp | |
let flags = __ballot(flag) | |
// mask out the bits above the current thread | |
let flags = flags &&& bfi(0, 0xffffffff, 0, tid + 1) | |
// find the distance from the current thread to the thread at the start of | |
// the segment | |
let distance = DF.__clz(flags) + tid - 31 | |
let shared = __shared__<int>(Util.WARP_SIZE).Ptr(0).Volatile() | |
let x0 = 0x7fffffff &&& packed | |
let mutable x = x0 | |
shared.[tid] <- x | |
// perform the parallel scan. Note the conditional if(offset < distance) | |
// replaces the ordinary scan conditional if(offset <= tid) | |
for i = 0 to Util.LOG_WARP_SIZE - 1 do | |
let offset = 1 <<< i | |
if offset <= distance then x <- x + shared.[tid - offset] | |
shared.[tid] <- x | |
// turn inclusive scan into exclusive scan | |
x <- x - x0 | |
outputs.[tid] <- x | |
distances.[tid] <- distance | |
[<Test>] | |
let test() = | |
let blockSize = 256 | |
let numWarps = blockSize / Util.WARP_SIZE | |
let s x = x ||| (1 <<< 31) | |
let hInputs = | |
[| | |
s 3; 0; 3; 3; 0; s 1; 2; 0; 3; 3; 3; 2; 3; 0; 3; 1; | |
0; 0; 2; 3; 2; s 3; 1; 0; 2; 1; 2; 1; 1; 0; 1; s 3; | |
|] | |
let pfunct = cuda { | |
let! segscanWarp = <@ segscanWarp @> |> defineKernelFunc | |
return PFunc(fun (m:Module) -> | |
let worker = m.Worker | |
let segscanWarp = segscanWarp.Apply m | |
pcalc { | |
let! dInputs = DArray.scatterInBlob worker hInputs | |
let! dOutputs = DArray.createInBlob worker hInputs.Length | |
let! dDistances = DArray.createInBlob worker hInputs.Length | |
do! PCalc.action (fun hint -> | |
let lp = LaunchParam(1, Util.WARP_SIZE) |> hint.ModifyLaunchParam | |
segscanWarp.Launch lp dInputs.Ptr dOutputs.Ptr dDistances.Ptr) | |
let! hOutputs = dOutputs.Gather() | |
let! hDistances = dDistances.Gather() | |
printfn "outputs:" | |
for i = 0 to 15 do printf "%2d; " hOutputs.[i] | |
printfn "" | |
for i = 16 to 31 do printf "%2d; " hOutputs.[i] | |
printfn "" | |
printfn "distances:" | |
for i = 0 to 15 do printf "%2d; " hDistances.[i] | |
printfn "" | |
for i = 16 to 31 do printf "%2d; " hDistances.[i] | |
printfn "" } ) } | |
let calc = worker.LoadPModule(pfunct).Invoke | |
calc |> PCalc.run |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment