Skip to content

Instantly share code, notes, and snippets.

@mratsim
Last active September 16, 2017 09:26
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mratsim/dfbd944f64181727a97dffb30b8cbd0a to your computer and use it in GitHub Desktop.
Save mratsim/dfbd944f64181727a97dffb30b8cbd0a to your computer and use it in GitHub Desktop.
Calling CUDA kernels from Nim
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future
type
  GpuArray[T: SomeReal] = object
    ## A buffer of `len` elements of type `T` allocated with `cudaMalloc`.
    data: ref[ptr T]  ## `ref` cell holding the raw pointer; `newGpuArray`
                      ## attaches `deallocCuda` to it as a finalizer.
    len: int          ## Number of elements (not bytes) in the buffer.
# Build the CUDA source that defines `cuda_square` together with this module.
{.compile: "./square.cu".}
proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header:"../square.cuh".}
# "../square.cuh" (one directory up) is a workaround: the header is not copied
# to nimcache, so the include path presumably has to be resolved from there.
## Compute the square of each element of `x` and store it in `y`.
## bpg: blocks per grid
## tpb: threads per block
## `y` and `x` are handed unchanged to the kernel launch in square.cu; in this
## file they are always pointers obtained from `cudaMalloc`.
proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  ## Typed wrapper over the two-argument `cudaMalloc` binding: requests room
  ## for `size` elements of `T` and returns the pointer written by the call.
  ## Failures are reported through `check`.
  let byteCount = sizeof(T) * size
  # The binding writes the allocated address through a `ptr pointer`, so the
  # typed `result` slot is reinterpreted as an untyped pointer slot.
  let slot = cast[ptr pointer](addr result)
  check cudaMalloc(slot, byteCount)
proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  ## Release the allocation referenced by `p` with `cudaFree`.
  ## A nil pointer is left alone. Used as the finalizer in `newGpuArray`.
  let devPtr = p[]
  if devPtr.isNil:
    return
  check cudaFree(devPtr)
proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  ## Create a `GpuArray` with room for `len` elements of `T`.
  ## The contents are whatever `cudaMalloc` returns; nothing is written here.
  result.len = len
  # Register the finalizer before allocating so the pointer cell already owns
  # its cleanup routine by the time it receives an address.
  new(result.data, deallocCuda)
  result.data[] = cudaMalloc[T](len)
proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  ## Copy the host sequence `s` into a freshly allocated `GpuArray` of the
  ## same length (`cudaMemcpyHostToDevice`) and return it.
  ##
  ## An empty `s` yields a zero-length `GpuArray` and no copy is issued.
  ## Errors from the CUDA call are reported through `check`.
  result = newGpuArray[T](s.len)
  if s.len == 0:
    # An empty seq has no element 0 to take the address of; `s[0]` would
    # raise an index error, and there is nothing to transfer anyway.
    return
  let size = result.len * sizeof(T)  # transfer size in bytes
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)
proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  ## Copy the contents of `g` back into a new host sequence of `g.len`
  ## elements (`cudaMemcpyDeviceToHost`) and return it.
  ##
  ## A zero-length `g` yields an empty seq and no copy is issued.
  ## Errors from the CUDA call are reported through `check`.
  result = newSeq[T](g.len)
  if g.len == 0:
    # `result[0]` does not exist for an empty seq; taking its address would
    # raise an index error, and there is nothing to transfer anyway.
    return
  let size = result.len * sizeof(T)  # transfer size in bytes
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)
proc main() =
  ## Demo: upload a zero-filled buffer and the values 0..63, square the
  ## latter into the former with `cuda_square`, then download and print it.
  const n = 64  # element count; also the number of threads launched
  let a = newSeq[float32](n)                    # output buffer, all zeros
  let b = toSeq(0 .. n - 1).map(x => x.float32) # input values 0, 1, ..., n-1
  echo a
  echo b
  let u = a.cuda
  let v = b.cuda
  # One block of `n` threads: one thread per element.
  cuda_square(1.cint, n.cint, u.data[], v.data[])
  # Wait for the launch to finish before copying the result back.
  check cudaDeviceSynchronize()
  let z = u.cpu
  echo z

main()
## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0]
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0, 169.0, 196.0, 225.0, 256.0, 289.0, 324.0, 361.0, 400.0, 441.0, 484.0, 529.0, 576.0, 625.0, 676.0, 729.0, 784.0, 841.0, 900.0, 961.0, 1024.0, 1089.0, 1156.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, 1681.0, 1764.0, 1849.0, 1936.0, 2025.0, 2116.0, 2209.0, 2304.0, 2401.0, 2500.0, 2601.0, 2704.0, 2809.0, 2916.0, 3025.0, 3136.0, 3249.0, 3364.0, 3481.0, 3600.0, 3721.0, 3844.0, 3969.0]
#include "square.cuh"
// Element-wise square: d_out[i] = d_in[i] * d_in[i].
// Each thread handles exactly one element. There is no length parameter and
// therefore no bounds check: both buffers must hold at least
// gridDim.x * blockDim.x floats.
__global__ void square(float * d_out, float * d_in){
  // Global element index. Using threadIdx.x alone would make every block
  // recompute elements [0, blockDim.x) whenever more than one block is launched.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}
// Host-side entry point: launches the `square` kernel with `bpg` blocks of
// `tpb` threads on the given output and input buffers. The launch is not
// synchronized here.
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
  const dim3 grid(bpg);
  const dim3 block(tpb);
  square<<<grid, block>>>(d_out, d_in);
}
#ifndef SQUARE_CUH
#define SQUARE_CUH

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

/* Launches the `square` kernel with `bpg` blocks of `tpb` threads, writing
 * the square of each element of `d_in` into `d_out`. */
void cuda_square(int bpg, int tpb, float * d_out, float * d_in);

#endif /* SQUARE_CUH */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment