Skip to content

Instantly share code, notes, and snippets.

@soloman817
Created March 2, 2013 03:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soloman817/5069624 to your computer and use it in GitHub Desktop.
Save soloman817/5069624 to your computer and use it in GitHub Desktop.
segmented scan tests
open NUnit.Framework
open Alea.Interop.LLVM
open Alea.CUDA
open Alea.CUDA.Extension
// define an alias
module IRB = Alea.CUDA.IRBuilderUtil
module DF = Alea.CUDA.DeviceFunction
// get the worker
let worker = getDefaultWorker()
[<IRB.LLVMFunctionBuilder>]
let bfi(x:int, y:int, bit:int, numBits:int):int = failwith "Device Only!"
let ``bfi [BUILDER]``(ctx:IRB.LLVMFunctionBuilderContext) =
let args = ctx.LLVMValueArgs // arguments LLVM values
let i32t = ctx.LLVMHelper.i32_t // int LLVM type
let rett = i32t // return type
let argst = [| i32t; i32t; i32t; i32t |] // argument type list
let funct = LLVMFunctionTypeEx(rett, argst, 0)
let funcp = LLVMConstInlineAsm(funct, "bfi.b32 \t$0, $2, $1, $3, $4;", "=r,r,r,r,r", 0, 0)
IRB.Value(LLVMBuildCallEx(ctx.Builder, funcp, args, ""))
[<ReflectedDefinition>]
let segscanWarp (inputs:DevicePtr<int>) (outputs:DevicePtr<int>) (distances:DevicePtr<int>) =
let tid = threadIdx.x
let packed = inputs.[tid]
// the start flag is in the high bit
let flag = 0x80000000 &&& packed
// get the start flags for each thread in the warp
let flags = __ballot(flag)
// mask out the bits above the current thread
let flags = flags &&& bfi(0, 0xffffffff, 0, tid + 1)
// find the distance from the current thread to the thread at the start of
// the segment
let distance = DF.__clz(flags) + tid - 31
let shared = __shared__<int>(Util.WARP_SIZE).Ptr(0).Volatile()
let x0 = 0x7fffffff &&& packed
let mutable x = x0
shared.[tid] <- x
// perform the parallel scan. Note the conditional if(offset < distance)
// replaces the ordinary scan conditional if(offset <= tid)
for i = 0 to Util.LOG_WARP_SIZE - 1 do
let offset = 1 <<< i
if offset <= distance then x <- x + shared.[tid - offset]
shared.[tid] <- x
// turn inclusive scan into exclusive scan
x <- x - x0
outputs.[tid] <- x
distances.[tid] <- distance
[<Test>]
let test() =
let blockSize = 256
let numWarps = blockSize / Util.WARP_SIZE
let s x = x ||| (1 <<< 31)
let hInputs =
[|
s 3; 0; 3; 3; 0; s 1; 2; 0; 3; 3; 3; 2; 3; 0; 3; 1;
0; 0; 2; 3; 2; s 3; 1; 0; 2; 1; 2; 1; 1; 0; 1; s 3;
|]
let pfunct = cuda {
let! segscanWarp = <@ segscanWarp @> |> defineKernelFunc
return PFunc(fun (m:Module) ->
let worker = m.Worker
let segscanWarp = segscanWarp.Apply m
pcalc {
let! dInputs = DArray.scatterInBlob worker hInputs
let! dOutputs = DArray.createInBlob worker hInputs.Length
let! dDistances = DArray.createInBlob worker hInputs.Length
do! PCalc.action (fun hint ->
let lp = LaunchParam(1, Util.WARP_SIZE) |> hint.ModifyLaunchParam
segscanWarp.Launch lp dInputs.Ptr dOutputs.Ptr dDistances.Ptr)
let! hOutputs = dOutputs.Gather()
let! hDistances = dDistances.Gather()
printfn "outputs:"
for i = 0 to 15 do printf "%2d; " hOutputs.[i]
printfn ""
for i = 16 to 31 do printf "%2d; " hOutputs.[i]
printfn ""
printfn "distances:"
for i = 0 to 15 do printf "%2d; " hDistances.[i]
printfn ""
for i = 16 to 31 do printf "%2d; " hDistances.[i]
printfn "" } ) }
let calc = worker.LoadPModule(pfunct).Invoke
calc |> PCalc.run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment