Last active
September 16, 2017 09:26
-
-
Save mratsim/dfbd944f64181727a97dffb30b8cbd0a to your computer and use it in GitHub Desktop.
Calling CUDA kernels from Nim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nimcuda/[cuda_runtime_api, driver_types, nimcuda] | |
import sequtils, future | |
type
  GpuArray[T: SomeReal] = object
    ## A length-tagged handle to a buffer obtained through `cudaMalloc`.
    data: ref[ptr T]  ## ref-wrapped raw pointer; `newGpuArray` creates this
                      ## ref with `deallocCuda` attached as its finalizer
    len: int          ## number of `T` elements (not bytes)
# Build the CUDA translation unit together with this module.
{.compile: "./square.cu".}

# "../square.cuh" is a workaround: the header is not copied to nimcache,
# so it is referenced relative to that directory instead.
proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.
  importc, header: "../square.cuh".}
  ## Compute the square of every element of `x` and store it in `y`.
  ##
  ## * `bpg`: blocks per grid
  ## * `tpb`: threads per block
proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  ## Typed convenience overload of the runtime `cudaMalloc`.
  ##
  ## Requests room for `size` elements of `T` (i.e. `size * sizeof(T)`
  ## bytes) and returns the pointer written by the runtime call. The
  ## returned status code is handed to `check`.
  # `result` itself is used as the out-parameter of the untyped runtime call.
  check cudaMalloc(cast[ptr pointer](addr result), size * sizeof(T))
proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  ## Releases the buffer that `p` points to with `cudaFree`.
  ##
  ## Does nothing when the wrapped pointer is nil. `newGpuArray` passes this
  ## proc to `new` as the finalizer of the `ref`.
  if p[].isNil:
    return
  check cudaFree(p[])
proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  ## Creates a `GpuArray` with room for `len` elements of `T`.
  ##
  ## The buffer comes from `cudaMalloc[T]`; its contents are not initialised
  ## by this proc.
  result.len = len
  # Attach the finalizer before the allocation so the pointer stored below
  # is already covered by `deallocCuda`.
  new(result.data, deallocCuda)
  result.data[] = cudaMalloc[T](len)
proc cuda[T:SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.}=
  ## Copies the host sequence `s` into a newly created `GpuArray` of the
  ## same length (`cudaMemcpyHostToDevice`).
  ##
  ## An empty `s` yields a zero-length array and no transfer is issued:
  ## taking the address of `s[0]` on an empty seq would fail the bounds
  ## check before any copy could happen.
  result = newGpuArray[T](s.len)
  if s.len == 0:
    return
  let size = result.len * sizeof(T)  # transfer size in bytes
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)
proc cpu[T:SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.}=
  ## Copies the contents of `g` into a new host seq of `g.len` elements
  ## (`cudaMemcpyDeviceToHost`).
  ##
  ## A zero-length array returns an empty seq without touching `g.data`:
  ## `result[0]` does not exist in that case, and a default-initialised
  ## `GpuArray` has a nil `data` ref that must not be dereferenced.
  result = newSeq[T](g.len)
  if g.len == 0:
    return
  let size = result.len * sizeof(T)  # transfer size in bytes
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)
proc main() =
  ## Demo: squares the numbers 0..63 with the CUDA kernel and prints the
  ## zero-filled input buffer, the source values and the squared result.
  const elemCount = 64

  let zeros = newSeq[float32](elemCount)
  let indices = toSeq(0 ..< elemCount)
  let values = indices.mapIt(it.float32)
  echo zeros
  echo values

  # Upload both host sequences; `outBuf` receives the kernel's output.
  let outBuf = zeros.cuda
  let inBuf = values.cuda

  # One block of `elemCount` threads: one thread per element.
  cuda_square(1.cint, elemCount.cint, outBuf.data[], inBuf.data[])
  check cudaDeviceSynchronize()

  let squared = outBuf.cpu
  echo squared

main()
## Output: | |
# @[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | |
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0] | |
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0, 169.0, 196.0, 225.0, 256.0, 289.0, 324.0, 361.0, 400.0, 441.0, 484.0, 529.0, 576.0, 625.0, 676.0, 729.0, 784.0, 841.0, 900.0, 961.0, 1024.0, 1089.0, 1156.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, 1681.0, 1764.0, 1849.0, 1936.0, 2025.0, 2116.0, 2209.0, 2304.0, 2401.0, 2500.0, 2601.0, 2704.0, 2809.0, 2916.0, 3025.0, 3136.0, 3249.0, 3364.0, 3481.0, 3600.0, 3721.0, 3844.0, 3969.0] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "square.cuh" | |
// Element-wise square: d_out[i] = d_in[i] * d_in[i].
//
// Each thread handles the single element at its global index, so a launch
// with several blocks covers bpg * tpb distinct elements instead of every
// block redoing the first tpb ones.
//
// There is no length parameter and therefore no bounds check: the caller
// must launch at most as many threads as both buffers have elements.
__global__ void square(float * d_out, float * d_in){
    // Global index: block offset plus position inside the block.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float f = d_in[idx];
    d_out[idx] = f * f;
}
// Host-side entry point: launches the `square` kernel on a grid of `bpg`
// blocks with `tpb` threads each, forwarding both buffers unchanged.
// The launch is not followed by any synchronisation or error query here.
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
    const dim3 grid(bpg);
    const dim3 block(tpb);
    square<<<grid, block>>>(d_out, d_in);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SQUARE_CUH
#define SQUARE_CUH

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

/* C linkage keeps the symbol name unmangled, so the declaration matches
 * the definition whether the including code is compiled as C or as C++. */
#ifdef __cplusplus
extern "C" {
#endif

/* Launches the element-wise square kernel.
 *   bpg   - blocks per grid
 *   tpb   - threads per block
 *   d_out - destination buffer
 *   d_in  - source buffer
 */
void cuda_square(int bpg, int tpb, float * d_out, float * d_in);

#ifdef __cplusplus
}
#endif

#endif /* SQUARE_CUH */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment