mratsim/call_cuda.nim

## call_cuda.nim
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future

type GpuArray[T: SomeReal] = object
  ## Handle to `len` elements of `T` whose storage pointer is filled in by
  ## `newGpuArray` via `cudaMalloc[T]`.
  data: ref[ptr T]  ## ref around the raw pointer; `newGpuArray` attaches `deallocCuda` as its finalizer
  len: int          ## element count (byte sizes are computed as `len * sizeof(T)`)

# Compile square.cu together with this module.
{.compile: "./square.cu".}
proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header:"../square.cuh".}
# The "../square.cuh" path is a workaround because the header is not copied
# to nimcache.
## Binding for `cuda_square` declared in square.cuh: computes the square of
## each element of `x` and stores it in `y`.
## bpg: blocks per grid
## tpb: threads per block

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.}=
  ## Typed wrapper over the two-argument `cudaMalloc`: requests room for
  ## `size` elements of `T` and returns the pointer written by that call.
  ## The status code of the call is handed to `check`.
  let byteCount = sizeof(T) * size
  # The underlying call writes the allocation into `result` through this
  # untyped pointer-to-pointer view.
  let outSlot = cast[ptr pointer](addr result)
  check cudaMalloc(outSlot, byteCount)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}=
  ## Finalizer used for `GpuArray.data`: passes the wrapped pointer to
  ## `cudaFree`, skipping the call when the pointer is nil.
  let devPtr = p[]
  if devPtr.isNil:
    return
  check cudaFree(devPtr)

proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.}=
  ## Creates a `GpuArray[T]` holding `len` elements.
  ## `data` is allocated with `deallocCuda` registered as its finalizer and
  ## then set to the pointer returned by `cudaMalloc[T]`.
  result.len = len
  new(result.data, deallocCuda)
  result.data[] = cudaMalloc[T](len)

proc cuda[T:SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.}=
  ## Copies the host sequence `s` into a newly created `GpuArray[T]` of the
  ## same length, using `cudaMemcpyHostToDevice`.
  ##
  ## For an empty `s` the array is still created but no copy is issued:
  ## `s[0]` does not exist, so taking its address would raise an index error.
  result = newGpuArray[T](s.len)
  if s.len == 0:
    return

  let size = result.len * sizeof(T)

  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T:SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.}=
  ## Copies the `g.len` elements behind `g.data` into a new host `seq[T]`,
  ## using `cudaMemcpyDeviceToHost`.
  ##
  ## A zero-length `g` returns an empty seq without issuing a copy:
  ## `result[0]` does not exist, so taking its address would raise an index
  ## error.
  result = newSeq[T](g.len)
  if g.len == 0:
    return

  let size = result.len * sizeof(T)

  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)


proc main() =
  ## Demo: uploads a zero-filled buffer and the values 0..63, squares the
  ## latter into the former with `cuda_square`, then downloads and prints
  ## the result.
  const n = 64  # element count; also the threads-per-block value below

  let a = newSeq[float32](n)                      # zero-filled output buffer
  let b = toSeq(0 .. n-1).map(x => x.float32)     # 0.0, 1.0, ..., 63.0

  echo a
  echo b

  # Neither handle is reassigned, so both are `let`.
  let u = a.cuda
  let v = b.cuda

  # One block of `n` threads; `u` receives the squares of `v`.
  cuda_square(1.cint, n.cint, u.data[], v.data[])

  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()
## Output:

# @[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0]
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0, 169.0, 196.0, 225.0, 256.0, 289.0, 324.0, 361.0, 400.0, 441.0, 484.0, 529.0, 576.0, 625.0, 676.0, 729.0, 784.0, 841.0, 900.0, 961.0, 1024.0, 1089.0, 1156.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, 1681.0, 1764.0, 1849.0, 1936.0, 2025.0, 2116.0, 2209.0, 2304.0, 2401.0, 2500.0, 2601.0, 2704.0, 2809.0, 2916.0, 3025.0, 3136.0, 3249.0, 3364.0, 3481.0, 3600.0, 3721.0, 3844.0, 3969.0]

## square.cu
#include "square.cuh"

// Element-wise square: each thread reads one float from d_in and writes its
// square to the same position in d_out.
//
// The index is the thread's global position in a 1-D grid. With a single
// block this equals threadIdx.x; with several blocks each block now handles
// its own slice instead of every block repeating elements 0..blockDim.x-1.
//
// There is no bounds check: both buffers must hold at least
// gridDim.x * blockDim.x floats.
__global__ void square(float * d_out, float * d_in){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float f = d_in[idx];
    d_out[idx] = f * f;
}


// Host-side entry point: launches the `square` kernel with `bpg` blocks of
// `tpb` threads each, forwarding d_out and d_in unchanged.
// The function returns void and performs no synchronization itself.
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
    const dim3 blocksPerGrid(bpg);
    const dim3 threadsPerBlock(tpb);
    square<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_in);
}

## square.cuh
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Launches the `square` kernel with `bpg` blocks of `tpb` threads per block;
// the kernel stores d_in[i] * d_in[i] into d_out[i].
void cuda_square(int bpg, int tpb, float * d_out, float * d_in);
	import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
	import sequtils, future

	type GpuArray[T: SomeReal] = object
	## Handle to `len` elements of `T` whose storage pointer is filled in by
	## `newGpuArray` via `cudaMalloc[T]`.
	data: ref[ptr T]  ## ref around the raw pointer; `newGpuArray` attaches `deallocCuda` as its finalizer
	len: int  ## element count (byte sizes are computed as `len * sizeof(T)`)

	# Compile square.cu together with this module.
	{.compile: "./square.cu".}
	proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header:"../square.cuh".}
	# The "../square.cuh" path is a workaround because the header is not copied
	# to nimcache.
	## Binding for `cuda_square` declared in square.cuh: computes the square of
	## each element of `x` and stores it in `y`.
	## bpg: blocks per grid
	## tpb: threads per block

	proc cudaMalloc[T](size: int): ptr T {.noSideEffect.}=
	  ## Typed wrapper over the two-argument `cudaMalloc`: requests room for
	  ## `size` elements of `T` and returns the pointer written by that call.
	  ## The status code of the call is handed to `check`.
	  let byteCount = sizeof(T) * size
	  # The underlying call writes the allocation into `result` through this
	  # untyped pointer-to-pointer view.
	  let outSlot = cast[ptr pointer](addr result)
	  check cudaMalloc(outSlot, byteCount)

	proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}=
	  ## Finalizer used for `GpuArray.data`: passes the wrapped pointer to
	  ## `cudaFree`, skipping the call when the pointer is nil.
	  let devPtr = p[]
	  if devPtr.isNil:
	    return
	  check cudaFree(devPtr)

	proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.}=
	  ## Creates a `GpuArray[T]` holding `len` elements.
	  ## `data` is allocated with `deallocCuda` registered as its finalizer and
	  ## then set to the pointer returned by `cudaMalloc[T]`.
	  result.len = len
	  new(result.data, deallocCuda)
	  result.data[] = cudaMalloc[T](len)

	proc cuda[T:SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.}=
	  ## Copies the host sequence `s` into a newly created `GpuArray[T]` of the
	  ## same length, using `cudaMemcpyHostToDevice`.
	  ##
	  ## For an empty `s` the array is still created but no copy is issued:
	  ## `s[0]` does not exist, so taking its address would raise an index error.
	  result = newGpuArray[T](s.len)
	  if s.len == 0:
	    return

	  let size = result.len * sizeof(T)

	  check cudaMemCpy(result.data[],
	                   unsafeAddr s[0],
	                   size,
	                   cudaMemcpyHostToDevice)

	proc cpu[T:SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.}=
	  ## Copies the `g.len` elements behind `g.data` into a new host `seq[T]`,
	  ## using `cudaMemcpyDeviceToHost`.
	  ##
	  ## A zero-length `g` returns an empty seq without issuing a copy:
	  ## `result[0]` does not exist, so taking its address would raise an index
	  ## error.
	  result = newSeq[T](g.len)
	  if g.len == 0:
	    return

	  let size = result.len * sizeof(T)

	  check cudaMemCpy(addr result[0],
	                   g.data[],
	                   size,
	                   cudaMemcpyDeviceToHost)


	proc main() =
	  ## Demo: uploads a zero-filled buffer and the values 0..63, squares the
	  ## latter into the former with `cuda_square`, then downloads and prints
	  ## the result.
	  const n = 64  # element count; also the threads-per-block value below

	  let a = newSeq[float32](n)                      # zero-filled output buffer
	  let b = toSeq(0 .. n-1).map(x => x.float32)     # 0.0, 1.0, ..., 63.0

	  echo a
	  echo b

	  # Neither handle is reassigned, so both are `let`.
	  let u = a.cuda
	  let v = b.cuda

	  # One block of `n` threads; `u` receives the squares of `v`.
	  cuda_square(1.cint, n.cint, u.data[], v.data[])

	  check cudaDeviceSynchronize()

	  let z = u.cpu
	  echo z

	main()
	## Output:

	# @[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
	# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0]
	# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0, 169.0, 196.0, 225.0, 256.0, 289.0, 324.0, 361.0, 400.0, 441.0, 484.0, 529.0, 576.0, 625.0, 676.0, 729.0, 784.0, 841.0, 900.0, 961.0, 1024.0, 1089.0, 1156.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, 1681.0, 1764.0, 1849.0, 1936.0, 2025.0, 2116.0, 2209.0, 2304.0, 2401.0, 2500.0, 2601.0, 2704.0, 2809.0, 2916.0, 3025.0, 3136.0, 3249.0, 3364.0, 3481.0, 3600.0, 3721.0, 3844.0, 3969.0]
	#include "square.cuh"

	// Element-wise square: each thread reads one float from d_in and writes its
	// square to the same position in d_out.
	//
	// The index is the thread's global position in a 1-D grid. With a single
	// block this equals threadIdx.x; with several blocks each block now handles
	// its own slice instead of every block repeating elements 0..blockDim.x-1.
	//
	// There is no bounds check: both buffers must hold at least
	// gridDim.x * blockDim.x floats.
	__global__ void square(float * d_out, float * d_in){
	    int idx = blockIdx.x * blockDim.x + threadIdx.x;
	    float f = d_in[idx];
	    d_out[idx] = f * f;
	}


	// Host-side entry point: launches the `square` kernel with `bpg` blocks of
	// `tpb` threads each, forwarding d_out and d_in unchanged.
	// The function returns void and performs no synchronization itself.
	void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
	    const dim3 blocksPerGrid(bpg);
	    const dim3 threadsPerBlock(tpb);
	    square<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_in);
	}
	#include "cuda.h"
	#include "cuda_runtime.h"
	#include "device_launch_parameters.h"

	// Launches the `square` kernel with `bpg` blocks of `tpb` threads per block;
	// the kernel stores d_in[i] * d_in[i] into d_out[i].
	void cuda_square(int bpg, int tpb, float * d_out, float * d_in);