# JIT compile Cuda kernel in Nim
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
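# Build configuration: `gorge` runs at compile time, querying llvm-config
# for the flags needed to compile and link against the LLVM C API.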
{.passc: gorge("llvm-config --cflags").}
{.passl: gorge("llvm-config --libs").}
# ############################################################
#
# Bindings to LLVM JIT
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMC.html
# Constantine is a library. It is possible that applications relying on Constantine
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
# ############################################################
#
# LLVM
#
# ############################################################
# TODO: by using the dynlib pragma (https://nim-lang.org/docs/manual.html#foreign-function-interface-dynlib-pragma-for-import)
# we wouldn't need the LLVM dev packages, only the runtime, significantly easing installation and install size.
# However there wouldn't be headers, openArray arguments would need a wrapper.
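# A minimal sketch of that alternative, assuming a system-wide libLLVM
# shared library (the exact soname varies across distributions):
#
#   proc createContext(): ContextRef {.importc: "LLVMContextCreate", dynlib: "libLLVM.so".}
#
# Without the headers, openArray parameters (which Nim lowers to a
# pointer + length pair) would need explicit ptr/len wrappers.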
type
  LlvmBool* = distinct int32
  MemoryBufferRef* {.importc: "LLVMMemoryBufferRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ContextRef* {.importc: "LLVMContextRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ModuleRef* {.importc: "LLVMModuleRef", header: "<llvm-c/Core.h>".} = distinct pointer
  TargetRef* {.importc: "LLVMTargetRef", header: "<llvm-c/Types.h>".} = distinct pointer
  ExecutionEngineRef* {.importc: "LLVMExecutionEngineRef", header: "<llvm-c/ExecutionEngine.h>".} = distinct pointer
  TypeRef* {.importc: "LLVMTypeRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ValueRef* {.importc: "LLVMValueRef", header: "<llvm-c/Core.h>".} = distinct pointer
  NamedMDNodeRef* {.importc: "LLVMNamedMDNodeRef", header: "<llvm-c/Types.h>".} = distinct pointer
  MetadataRef* {.importc: "LLVMMetadataRef", header: "<llvm-c/Types.h>".} = distinct pointer
{.push header: "<llvm-c/Core.h>".}
proc createContext*(): ContextRef {.importc: "LLVMContextCreate".}
proc dispose*(ctx: ContextRef) {.importc: "LLVMContextDispose".}
proc dispose*(msg: cstring) {.importc: "LLVMDisposeMessage".}
  ## cstrings returned by LLVM are owned by LLVM and must be destroyed with this specific function
proc dispose*(buf: MemoryBufferRef){.importc: "LLVMDisposeMemoryBuffer".}
proc getBufferStart*(buf: MemoryBufferRef): ptr byte {.importc: "LLVMGetBufferStart".}
proc getBufferSize*(buf: MemoryBufferRef): csize_t {.importc: "LLVMGetBufferSize".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################
#
# Module
#
# ############################################################
{.push header: "<llvm-c/Core.h>".}
proc llvmCreateModule(name: cstring, ctx: ContextRef): ModuleRef {.importc: "LLVMModuleCreateWithNameInContext".}
template createModule*(ctx: ContextRef, name: cstring): ModuleRef =
  llvmCreateModule(name, ctx)
proc dispose*(m: ModuleRef) {.importc: "LLVMDisposeModule".}
  ## Destroys a module
  ## Note: destroying an Execution Engine will also destroy modules attached to it
proc toIRString*(m: ModuleRef): cstring {.importc: "LLVMPrintModuleToString".}
  ## Print a module's IR to a textual string. The string must be disposed with LLVM's "dispose" or memory will leak.
proc getContext*(m: ModuleRef): ContextRef {.importc: "LLVMGetModuleContext".}
proc getOrInsertNamedMetadata*(m: ModuleRef, name: openArray[char]): NamedMDNodeRef {.importc: "LLVMGetOrInsertNamedMetadata".}
proc addNamedMetadataOperand*(m: ModuleRef, name: cstring, val: ValueRef) {.importc: "LLVMAddNamedMetadataOperand".}
proc metadataNode*(ctx: ContextRef, metadataNodes: openArray[MetadataRef]): MetadataRef {.importc: "LLVMMDNodeInContext2".}
proc metadataNode*(ctx: ContextRef, str: openArray[char]): MetadataRef {.importc: "LLVMMDStringInContext2".}
proc asMetadataRef*(val: ValueRef): MetadataRef {.importc: "LLVMValueAsMetadata".}
proc asValueRef*(ctx: ContextRef, md: MetadataRef): ValueRef {.importc: "LLVMMetadataAsValue".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
{.push header: "<llvm-c/BitWriter.h>".}
proc writeBitcodeToFile*(m: ModuleRef, path: cstring) {.importc: "LLVMWriteBitcodeToFile".}
proc writeBitcodeToMemoryBuffer*(m: ModuleRef): MemoryBufferRef {.importc: "LLVMWriteBitcodeToMemoryBuffer".}
  ## Write bitcode to a memory buffer
  ## The MemoryBuffer must be disposed appropriately or memory will leak
{.pop.} # {.push header: "<llvm-c/BitWriter.h>".}
proc toBitcode*(m: ModuleRef): seq[byte] =
  ## Serialize a module to LLVM bitcode
  let mb = m.writeBitcodeToMemoryBuffer()
  let len = int mb.getBufferSize()
  result.newSeq(len)
  copyMem(result[0].addr, mb.getBufferStart(), len)
  mb.dispose()
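when isMainModule:
  # A minimal self-check sketch for the serialization above (assumes the
  # LLVM dev libraries are linked via the llvm-config pragmas at the top):
  # even an empty module should serialize to a non-empty bitcode blob.
  block:
    let ctx = createContext()
    let m = ctx.createModule("bitcode_smoke_test")
    doAssert m.toBitcode().len > 0
    m.dispose()
    ctx.dispose()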
type VerifierFailureAction* {.size: sizeof(cint).} = enum
  AbortProcessAction # verifier will print to stderr and abort()
  PrintMessageAction # verifier will print to stderr and return 1
  ReturnStatusAction # verifier will just return 1
{.push header: "<llvm-c/Analysis.h>".}
proc verify*(module: ModuleRef, failureAction: VerifierFailureAction, msg: var cstring): LlvmBool {.importc: "LLVMVerifyModule".}
proc verify*(fn: ValueRef, failureAction: VerifierFailureAction): LlvmBool {.importc: "LLVMVerifyFunction".}
{.pop.}
# ############################################################
#
# Target
#
# ############################################################
{.push header: "<llvm-c/Target.h>".}
proc initializeNativeTarget*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeTarget".}
proc initializeNativeAsmPrinter*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeAsmPrinter".}
proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".}
{.pop.}
{.push header: "<llvm-c/Core.h>".}
proc setTarget*(module: ModuleRef, triple: cstring) {.importc: "LLVMSetTarget".}
proc setDataLayout*(module: ModuleRef, layout: cstring) {.importc: "LLVMSetDataLayout".}
{.pop.}
# ############################################################
#
# Execution Engine
#
# ############################################################
{.push header: "<llvm-c/ExecutionEngine.h>".}
proc linkInMCJIT*() {.importc: "LLVMLinkInMCJIT".}
proc createJITCompilerForModule*(
       engine: var ExecutionEngineRef,
       module: ModuleRef,
       optLevel: uint32,
       err: var cstring): LlvmBool {.importc: "LLVMCreateJITCompilerForModule".}
proc remove*(
       engine: ExecutionEngineRef,
       m: ModuleRef,
       outMod: var ModuleRef,
       err: var cstring): LlvmBool {.importc: "LLVMRemoveModule".}
proc dispose*(engine: ExecutionEngineRef) {.importc: "LLVMDisposeExecutionEngine".}
  ## Destroys an execution engine
  ## Note: destroying an Execution Engine will also destroy modules attached to it
proc getFunctionAddress*(engine: ExecutionEngineRef, name: cstring): uint64 {.importc: "LLVMGetFunctionAddress".}
  ## Returns the address of the JIT-compiled function (the C API returns a uint64_t)
{.pop.} # {.push header: "<llvm-c/ExecutionEngine.h>".}
# ############################################################
#
# Types
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreType.html
type
  TypeKind* {.size: sizeof(cint).} = enum
    tkVoid,           ## type with no size
    tkHalf,           ## 16 bit floating point type
    tkFloat,          ## 32 bit floating point type
    tkDouble,         ## 64 bit floating point type
    tkX86_FP80,       ## 80 bit floating point type (X87)
    tkFP128,          ## 128 bit floating point type (112-bit mantissa)
    tkPPC_FP128,      ## 128 bit floating point type (two 64-bits)
    tkLabel,          ## Labels
    tkInteger,        ## Arbitrary bit width integers
    tkFunction,       ## Functions
    tkStruct,         ## Structures
    tkArray,          ## Arrays
    tkPointer,        ## Pointers
    tkVector,         ## Fixed width SIMD vector type
    tkMetadata,       ## Metadata
    tkX86_MMX,        ## X86 MMX
    tkToken,          ## Tokens
    tkScalableVector, ## Scalable SIMD vector type
    tkBFloat,         ## 16 bit brain floating point type
    tkX86_AMX         ## X86 AMX
{.push header: "<llvm-c/Core.h>".}
proc getTypeKind*(ty: TypeRef): TypeKind {.importc: "LLVMGetTypeKind".}
proc void_t*(ctx: ContextRef): TypeRef {.importc: "LLVMVoidTypeInContext".}
# Integers
# ------------------------------------------------------------
proc int1_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt1TypeInContext".}
proc int8_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt8TypeInContext".}
proc int16_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt16TypeInContext".}
proc int32_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt32TypeInContext".}
proc int64_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt64TypeInContext".}
proc int128_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt128TypeInContext".}
proc int_t*(ctx: ContextRef, numBits: uint32): TypeRef {.importc: "LLVMIntTypeInContext".}
# Composite
# ------------------------------------------------------------
proc struct_t*(
       ctx: ContextRef,
       elemTypes: openArray[TypeRef], # requires implicit conversion of the length to uint32, which requires the header
       packed: LlvmBool): TypeRef {.importc: "LLVMStructTypeInContext".}
proc array_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
proc pointerType(elementType: TypeRef; addressSpace: cuint): TypeRef {.importc: "LLVMPointerType".}
# Functions
# ------------------------------------------------------------
proc function_t*(
       returnType: TypeRef,
       paramTypes: openArray[TypeRef], # requires implicit conversion of the length to uint32, which requires the header
       isVarArg: LlvmBool): TypeRef {.importc: "LLVMFunctionType".}
proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: "LLVMAddFunction".}
  ## Declare a function `name` in a module.
  ## Returns a handle to specify its instructions
proc printTypeToString(ty: TypeRef): cstring {.importc: "LLVMPrintTypeToString".}
# TODO: Function and Parameter attributes:
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#function-attributes
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#parameter-attributes
#
# We can use attributes to specify additional guarantees of Constantine code, for instance:
# - "pure" function with: nounwind, readonly
# - pointer particularities: readonly, writeonly, noalias, inalloca, byval
proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ------------------------------
proc `$`*(ty: TypeRef): string =
  let s = ty.printTypeToString()
  result = $s
  s.dispose()

proc isVoid*(ty: TypeRef): bool {.inline.} =
  ty.getTypeKind == tkVoid

proc pointer_t*(elementTy: TypeRef): TypeRef {.inline.} =
  pointerType(elementTy, addressSpace = 0)
# ############################################################
#
# Values
#
# ############################################################
{.push header: "<llvm-c/Core.h>".}
proc getTypeOf*(x: ValueRef): TypeRef {.importc: "LLVMTypeOf".}
# Constants
# ------------------------------------------------------------
# https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html
proc constInt*(ty: TypeRef, n: culonglong, signExtend: LlvmBool): ValueRef {.importc: "LLVMConstInt".}
proc constReal*(ty: TypeRef, n: cdouble): ValueRef {.importc: "LLVMConstReal".}
proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".}
proc constAllOnes*(ty: TypeRef): ValueRef {.importc: "LLVMConstAllOnes".}
proc constStruct*(
       vals: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       packed: LlvmBool): ValueRef {.importc: "LLVMConstStruct".}
proc constArray*(
       ty: TypeRef,
       constantVals: openArray[ValueRef] # requires implicit conversion of the length to uint32, which requires the header
     ): ValueRef {.importc: "LLVMConstArray".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################
#
# IR builder
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreInstructionBuilder.html
type
  BasicBlockRef* {.importc: "LLVMBasicBlockRef", header: "<llvm-c/Core.h>".} = distinct pointer
  BuilderRef* {.importc: "LLVMBuilderRef", header: "<llvm-c/Core.h>".} = distinct pointer
    ## An instruction builder represents a point within a basic block and is
    ## the exclusive means of building instructions using the C interface.

  IntPredicate* {.size: sizeof(cint).} = enum
    IntEQ = 32, ## equal
    IntNE,      ## not equal
    IntUGT,     ## unsigned greater than
    IntUGE,     ## unsigned greater or equal
    IntULT,     ## unsigned less than
    IntULE,     ## unsigned less or equal
    IntSGT,     ## signed greater than
    IntSGE,     ## signed greater or equal
    IntSLT,     ## signed less than
    IntSLE      ## signed less or equal
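# Note: IntEQ starts at 32 because LLVMIntPredicate mirrors LLVM's
# ICmpInst::Predicate numbering, where integer predicates follow the
# floating-point ones.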
{.push header: "<llvm-c/Core.h>".}
# Instantiation
# ------------------------------------------------------------
proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".}
  ## Append a basic block to the end of a function
proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".}
proc dispose*(builder: BuilderRef) {.importc: "LLVMDisposeBuilder".}
# Functions
# ------------------------------------------------------------
proc getParam*(fn: ValueRef, index: uint32): ValueRef {.importc: "LLVMGetParam".}
proc retVoid*(builder: BuilderRef): ValueRef {.importc: "LLVMBuildRetVoid".}
proc ret*(builder: BuilderRef, returnVal: ValueRef) {.importc: "LLVMBuildRet".}
# Positioning
# ------------------------------------------------------------
proc position*(builder: BuilderRef, blck: BasicBlockRef, instr: ValueRef) {.importc: "LLVMPositionBuilder".}
proc positionBefore*(builder: BuilderRef, instr: ValueRef) {.importc: "LLVMPositionBuilderBefore".}
proc positionAtEnd*(builder: BuilderRef, blck: BasicBlockRef) {.importc: "LLVMPositionBuilderAtEnd".}
# Intermediate Representation
# ------------------------------------------------------------
#
# - NSW: no signed wrap, signed value cannot over- or underflow.
# - NUW: no unsigned wrap, unsigned value cannot over- or underflow.
proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAdd".}
proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWAdd".}
proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWAdd".}
proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSub".}
proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWSub".}
proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWSub".}
proc neg*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNeg".}
proc negNSW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWNeg".}
proc negNUW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWNeg".}
proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildMul".}
proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWMul".}
proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWMul".}
proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildUDiv".}
proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactUDiv".}
proc divS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSDiv".}
proc divS_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactSDiv".}
proc remU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildURem".}
proc remS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSRem".}
proc lshl*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildShl".}
proc lshr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLShr".}
proc ashr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAShr".}
proc `and`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAnd".}
proc `or`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildOr".}
proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildXor".}
proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNot".}
proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSelect".}
proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildICmp".}
proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildBitCast".}
proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildTrunc".}
proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildZExt".}
  ## Zero-extend
proc sext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildSExt".}
  ## Sign-extend
proc malloc*(builder: BuilderRef, ty: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildMalloc".}
proc mallocArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildArrayMalloc".}
proc free*(builder: BuilderRef, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildFree".}
proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildAlloca".}
proc allocaArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildArrayAlloca".}
proc getElementPtr2*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       name: cstring
     ): ValueRef {.importc: "LLVMBuildGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
proc getElementPtr2_InBounds*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       name: cstring
     ): ValueRef {.importc: "LLVMBuildInBoundsGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
  ## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc getElementPtr2_Struct*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       idx: uint32,
       name: cstring
     ): ValueRef {.importc: "LLVMBuildStructGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
  ## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc load2*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLoad2".}
proc store*(builder: BuilderRef, val, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildStore".}
proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.importc: "LLVMBuildMemSet".}
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemCpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemMove".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
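when isMainModule:
  # A hedged example exercising the comparison/selection builders above,
  # which the addition check below does not cover: build an unsigned
  # `umax` with icmp + select, then verify the module.
  block:
    let ctx = createContext()
    let module = ctx.createModule("umax")
    let i64 = ctx.int64_t()
    let fnTy = function_t(i64, [i64, i64], isVarArg = LlvmBool(false))
    let fn = module.addFunction("umax", fnTy)
    let builder = ctx.createBuilder()
    builder.positionAtEnd(ctx.appendBasicBlock(fn, "body"))
    let aGtB = builder.icmp(IntUGT, fn.getParam(0), fn.getParam(1), "a_gt_b")
    builder.ret(builder.select(aGtB, fn.getParam(0), fn.getParam(1), "max"))
    var errMsg: cstring
    discard module.verify(AbortProcessAction, errMsg) # aborts the process if the IR is invalid
    errMsg.dispose()
    builder.dispose()
    module.dispose()
    ctx.dispose()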
# ############################################################
#
# Sanity Check
#
# ############################################################
when isMainModule:
  echo "LLVM JIT compiler sanity check"

  let ctx = createContext()
  var module = ctx.createModule("addition")
  let i32 = ctx.int32_t()

  let addType = function_t(i32, [i32, i32], isVarArg = LlvmBool(false))
  let addBody = module.addFunction("add", addType)

  let builder = ctx.createBuilder()
  let blck = ctx.appendBasicBlock(addBody, "addBody")
  builder.positionAtEnd(blck)

  block:
    let a = addBody.getParam(0)
    let b = addBody.getParam(1)
    let sum = builder.add(a, b, "sum")
    builder.ret(sum)

  block:
    var errMsg: cstring
    let errCode = module.verify(AbortProcessAction, errMsg)
    echo "Verification: code ", int(errCode), ", message \"", errMsg, "\""
    errMsg.dispose()

  var engine: ExecutionEngineRef
  block:
    let errCode = initializeNativeTarget()
    echo "Target init: code ", int(errCode)
    let errCodeASMPrinter = initializeNativeAsmPrinter() # Why is this necessary? Otherwise we get "LLVM ERROR: Target does not support MC emission!"
    echo "ASM printer init: code ", int(errCodeASMPrinter)

    var errMsg: cstring
    if bool createJITCompilerForModule(engine, module, optLevel = 0, errMsg):
      if errMsg.len > 0:
        echo errMsg
        echo "exiting ..."
      else:
        echo "JIT compiler: error without details ... exiting"
      quit 1

  let jitAdd = cast[proc(a, b: int32): int32 {.noconv.}](
    engine.getFunctionAddress("add"))
  echo "jitAdd(1, 2) = ", jitAdd(1, 2)
  doAssert jitAdd(1, 2) == 1 + 2

  block:
    # Cleanup
    # Note: when disposing the Execution Engine, attached modules are also disposed.
    # Here we go the extra mile of detaching the module for testing.
    builder.dispose()

    var errMsg: cstring
    let errCode = engine.remove(module, module, errMsg)
    echo "Detaching module from Execution Engine: code ", int(errCode), ", message \"", errMsg, "\""

    module.dispose()
    engine.dispose()
    ctx.dispose()
  echo "LLVM JIT - SUCCESS"
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Bindings to Nvidia GPUs libraries
#
# ############################################################
import ./llvm
type Flag*[E: enum] = distinct cint

func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
  ## Combine enum values into a bitflag.
  ## The enum should only have power-of-2 fields.
  # static:
  #   for val in E:
  #     assert (ord(val) and (ord(val) - 1)) == 0, "Enum values should all be power of 2, found " &
  #       $val & " with value " & $ord(val) & "."
  var flags = 0
  for val in e:
    flags = flags or ord(val)
  result = Flag[E](flags)
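when isMainModule:
  # Quick self-check of the combinator with a throwaway power-of-2 enum
  # (the enum is ours, purely for illustration):
  block:
    type DummyFlag = enum
      dfA = 1, dfB = 2, dfC = 4
    doAssert cint(flag(dfA, dfC)) == 5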
# ############################################################
#
# Cuda
#
# ############################################################
{.passl: "-L/opt/cuda/lib64 -lcuda".}
# Cuda offers 2 APIs:
# - cuda.h the driver API
# - cuda_runtime.h the runtime API
#
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
#
# We need to use the lower-level driver API for JIT modules loading and reloading
type
  CUresult* {.size: sizeof(cint).} = enum
    ## The API call returned with no errors. In the case of query calls, this
    ## also means that the operation being queried is complete (see
    ## ::cuEventQuery() and ::cuStreamQuery()).
    CUDA_SUCCESS = 0
    ## This indicates that one or more of the parameters passed to the API call
    ## is not within an acceptable range of values.
    CUDA_ERROR_INVALID_VALUE = 1
    ## The API call failed because it was unable to allocate enough memory to
    ## perform the requested operation.
    CUDA_ERROR_OUT_OF_MEMORY = 2
    ## This indicates that the CUDA driver has not been initialized with
    ## ::cuInit() or that initialization has failed.
    CUDA_ERROR_NOT_INITIALIZED = 3
    ## This indicates that the CUDA driver is in the process of shutting down.
    CUDA_ERROR_DEINITIALIZED = 4
    ## This indicates profiler is not initialized for this run. This can
    ## happen when the application is running with external profiling tools
    ## like visual profiler.
    CUDA_ERROR_PROFILER_DISABLED = 5
    ## Deprecated: it is no longer an error to attempt to enable/disable
    ## profiling via ::cuProfilerStart or ::cuProfilerStop without
    ## initialization.
    CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
    ## Deprecated: it is no longer an error to call cuProfilerStart() when
    ## profiling is already enabled.
    CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
    ## Deprecated: it is no longer an error to call cuProfilerStop() when
    ## profiling is already disabled.
    CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
    ## This indicates that the CUDA driver that the application has loaded is a
    ## stub library. Applications that run with the stub rather than a real
    ## driver loaded will result in CUDA API returning this error.
    CUDA_ERROR_STUB_LIBRARY = 34
    ## This indicates that requested CUDA device is unavailable at the current
    ## time. Devices are often unavailable due to use of
    ## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
    CUDA_ERROR_DEVICE_UNAVAILABLE = 46
    ## This indicates that no CUDA-capable devices were detected by the installed
    ## CUDA driver.
    CUDA_ERROR_NO_DEVICE = 100
    ## This indicates that the device ordinal supplied by the user does not
    ## correspond to a valid CUDA device or that the action requested is
    ## invalid for the specified device.
    CUDA_ERROR_INVALID_DEVICE = 101
    ## This error indicates that the Grid license is not applied.
    CUDA_ERROR_DEVICE_NOT_LICENSED = 102
    ## This indicates that the device kernel image is invalid. This can also
    ## indicate an invalid CUDA module.
    CUDA_ERROR_INVALID_IMAGE = 200
    ## This most frequently indicates that there is no context bound to the
    ## current thread. This can also be returned if the context passed to an
    ## API call is not a valid handle (such as a context that has had
    ## ::cuCtxDestroy() invoked on it). This can also be returned if a user
    ## mixes different API versions (i.e. 3010 context with 3020 API calls).
    ## See ::cuCtxGetApiVersion() for more details.
    CUDA_ERROR_INVALID_CONTEXT = 201
    ## This indicates that the context being supplied as a parameter to the
    ## API call was already the active context. Deprecated: it is no longer an
    ## error to attempt to push the active context via ::cuCtxPushCurrent().
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
    ## This indicates that a map or register operation has failed.
    CUDA_ERROR_MAP_FAILED = 205
    ## This indicates that an unmap or unregister operation has failed.
    CUDA_ERROR_UNMAP_FAILED = 206
    ## This indicates that the specified array is currently mapped and thus
    ## cannot be destroyed.
    CUDA_ERROR_ARRAY_IS_MAPPED = 207
    ## This indicates that the resource is already mapped.
    CUDA_ERROR_ALREADY_MAPPED = 208
    ## This indicates that there is no kernel image available that is suitable
    ## for the device. This can occur when a user specifies code generation
    ## options for a particular CUDA source file that do not include the
    ## corresponding device configuration.
    CUDA_ERROR_NO_BINARY_FOR_GPU = 209
    ## This indicates that a resource has already been acquired.
    CUDA_ERROR_ALREADY_ACQUIRED = 210
    ## This indicates that a resource is not mapped.
    CUDA_ERROR_NOT_MAPPED = 211
    ## This indicates that a mapped resource is not available for access as an
    ## array.
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
    ## This indicates that a mapped resource is not available for access as a
    ## pointer.
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
    ## This indicates that an uncorrectable ECC error was detected during
    ## execution.
    CUDA_ERROR_ECC_UNCORRECTABLE = 214
    ## This indicates that the ::CUlimit passed to the API call is not
    ## supported by the active device.
    CUDA_ERROR_UNSUPPORTED_LIMIT = 215
    ## This indicates that the ::CUcontext passed to the API call can
    ## only be bound to a single CPU thread at a time but is already
    ## bound to a CPU thread.
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
    ## This indicates that peer access is not supported across the given
    ## devices.
    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
    ## This indicates that a PTX JIT compilation failed.
    CUDA_ERROR_INVALID_PTX = 218
    ## This indicates an error with OpenGL or DirectX context.
    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
    ## This indicates that an uncorrectable NVLink error was detected during the
    ## execution.
    CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
    ## This indicates that the PTX JIT compiler library was not found.
    CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
    ## This indicates that the provided PTX was compiled with an unsupported toolchain.
    CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
    ## This indicates that the PTX JIT compilation was disabled.
    CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
    ## This indicates that the ::CUexecAffinityType passed to the API call is not
    ## supported by the active device.
    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
    ## This indicates that the device kernel source is invalid. This includes
    ## compilation/linker errors encountered in device code or user error.
    CUDA_ERROR_INVALID_SOURCE = 300
    ## This indicates that the file specified was not found.
    CUDA_ERROR_FILE_NOT_FOUND = 301
    ## This indicates that a link to a shared object failed to resolve.
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
    ## This indicates that initialization of a shared object failed.
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
    ## This indicates that an OS call failed.
    CUDA_ERROR_OPERATING_SYSTEM = 304
    ## This indicates that a resource handle passed to the API call was not
    ## valid. Resource handles are opaque types like ::CUstream and ::CUevent.
    CUDA_ERROR_INVALID_HANDLE = 400
    ## This indicates that a resource required by the API call is not in a
    ## valid state to perform the requested operation.
    CUDA_ERROR_ILLEGAL_STATE = 401
    ## This indicates that a named symbol was not found. Examples of symbols
    ## are global/constant variable names, driver function names, texture names,
    ## and surface names.
    CUDA_ERROR_NOT_FOUND = 500
    ## This indicates that asynchronous operations issued previously have not
    ## completed yet. This result is not actually an error, but must be indicated
    ## differently than ::CUDA_SUCCESS (which indicates completion). Calls that
    ## may return this value include ::cuEventQuery() and ::cuStreamQuery().
    CUDA_ERROR_NOT_READY = 600
    ## While executing a kernel, the device encountered a
    ## load or store instruction on an invalid memory address.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_ILLEGAL_ADDRESS = 700
    ## This indicates that a launch did not occur because it did not have
    ## appropriate resources. This error usually indicates that the user has
    ## attempted to pass too many arguments to the device kernel, or the
    ## kernel launch specifies too many threads for the kernel's register
    ## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
    ## when a 32-bit int is expected) is equivalent to passing too many
    ## arguments and can also result in this error.
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
    ## This indicates that the device kernel took too long to execute. This can
    ## only occur if timeouts are enabled - see the device attribute
    ## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_LAUNCH_TIMEOUT = 702
    ## This error indicates a kernel launch that uses an incompatible texturing
    ## mode.
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
    ## This error indicates that a call to ::cuCtxEnablePeerAccess() is
    ## trying to re-enable peer access to a context which has already
    ## had peer access to it enabled.
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
    ## This error indicates that ::cuCtxDisablePeerAccess() is
    ## trying to disable peer access which has not been enabled yet
    ## via ::cuCtxEnablePeerAccess().
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
    ## This error indicates that the primary context for the specified device
    ## has already been initialized.
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
    ## This error indicates that the context current to the calling thread
    ## has been destroyed using ::cuCtxDestroy, or is a primary context which
    ## has not yet been initialized.
    CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
    ## A device-side assert triggered during kernel execution. The context
    ## cannot be used anymore, and must be destroyed. All existing device
    ## memory allocations from this context are invalid and must be
    ## reconstructed if the program is to continue using CUDA.
    CUDA_ERROR_ASSERT = 710
    ## This error indicates that the hardware resources required to enable
    ## peer access have been exhausted for one or more of the devices
    ## passed to ::cuCtxEnablePeerAccess().
    CUDA_ERROR_TOO_MANY_PEERS = 711
    ## This error indicates that the memory range passed to ::cuMemHostRegister()
    ## has already been registered.
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
    ## This error indicates that the pointer passed to ::cuMemHostUnregister()
    ## does not correspond to any currently registered memory region.
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
    ## While executing a kernel, the device encountered a stack error.
    ## This can be due to stack corruption or exceeding the stack size limit.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_HARDWARE_STACK_ERROR = 714
    ## While executing a kernel, the device encountered an illegal instruction.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
    ## While executing a kernel, the device encountered a load or store instruction
    ## on a memory address which is not aligned.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_MISALIGNED_ADDRESS = 716
    ## While executing a kernel, the device encountered an instruction
    ## which can only operate on memory locations in certain address spaces
    ## (global, shared, or local), but was supplied a memory address not
    ## belonging to an allowed address space.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
    ## While executing a kernel, the device program counter wrapped its address space.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_INVALID_PC = 718
    ## An exception occurred on the device while executing a kernel. Common
    ## causes include dereferencing an invalid device pointer and accessing
    ## out of bounds shared memory. Less common cases can be system specific - more
    ## information about these cases can be found in the system specific user guide.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_LAUNCH_FAILED = 719
    ## This error indicates that the number of blocks launched per grid for a kernel that was
    ## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
    ## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
    ## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
    ## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
    ## This error indicates that the attempted operation is not permitted.
    CUDA_ERROR_NOT_PERMITTED = 800
    ## This error indicates that the attempted operation is not supported
    ## on the current system or device.
    CUDA_ERROR_NOT_SUPPORTED = 801
    ## This error indicates that the system is not yet ready to start any CUDA
    ## work. To continue using CUDA, verify the system configuration is in a
    ## valid state and all required driver daemons are actively running.
    ## More information about this error can be found in the system specific
    ## user guide.
    CUDA_ERROR_SYSTEM_NOT_READY = 802
    ## This error indicates that there is a mismatch between the versions of
    ## the display driver and the CUDA driver. Refer to the compatibility documentation
    ## for supported versions.
    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
    ## This error indicates that the system was upgraded to run with forward compatibility
    ## but the visible hardware detected by CUDA does not support this configuration.
    ## Refer to the compatibility documentation for the supported hardware matrix or ensure
    ## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
    ## environment variable.
    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
    ## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
    CUDA_ERROR_MPS_CONNECTION_FAILED = 805
    ## This error indicates that the remote procedure call between the MPS server and the MPS client failed.
    CUDA_ERROR_MPS_RPC_FAILURE = 806
    ## This error indicates that the MPS server is not ready to accept new MPS client requests.
    ## This error can be returned when the MPS server is in the process of recovering from a fatal failure.
    CUDA_ERROR_MPS_SERVER_NOT_READY = 807
    ## This error indicates that the hardware resources required to create MPS client have been exhausted.
    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
    ## This error indicates that the hardware resources required to support device connections have been exhausted.
    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
    ## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
    CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
    ## This error indicates that the operation is not permitted when
    ## the stream is capturing.
    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
    ## This error indicates that the current capture sequence on the stream
    ## has been invalidated due to a previous error.
    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
    ## This error indicates that the operation would have resulted in a merge
    ## of two independent capture sequences.
    CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
    ## This error indicates that the capture was not initiated in this stream.
    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
    ## This error indicates that the capture sequence contains a fork that was
    ## not joined to the primary stream.
    CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
    ## This error indicates that a dependency would have been created which
    ## crosses the capture sequence boundary. Only implicit in-stream ordering
    ## dependencies are allowed to cross the boundary.
    CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
    ## This error indicates a disallowed implicit dependency on a current capture
    ## sequence from cudaStreamLegacy.
    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
    ## This error indicates that the operation is not permitted on an event which
    ## was last recorded in a capturing stream.
    CUDA_ERROR_CAPTURED_EVENT = 907
    ## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
    ## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
    ## different thread.
    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
    ## This error indicates that the timeout specified for the wait operation has lapsed.
    CUDA_ERROR_TIMEOUT = 909
    ## This error indicates that the graph update was not performed because it included
    ## changes which violated constraints specific to instantiated graph update.
    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
    ## This indicates that an async error has occurred in a device outside of CUDA.
    ## If CUDA was waiting for an external device's signal before consuming shared data,
    ## the external device signaled an error indicating that the data is not valid for
    ## consumption. This leaves the process in an inconsistent state and any further CUDA
    ## work will return the same error. To continue using CUDA, the process must be
    ## terminated and relaunched.
    CUDA_ERROR_EXTERNAL_DEVICE = 911
    ## Indicates a kernel launch error due to cluster misconfiguration.
    CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
    ## This indicates that an unknown internal error has occurred.
    CUDA_ERROR_UNKNOWN = 999
template check*(status: CUresult) =
  ## Check the status code of a CUDA operation
  ## Exit the program with an error message on failure
  let code = status # Assign to ensure the input expression is evaluated only once
  if code != CUDA_SUCCESS:
    echo astToStr(status), " ", instantiationInfo(), " exited with error: ", code
    quit 1
type
  CUdevice* = distinct int32
    ## Compute Device handle

  CUdevice_attribute* {.size: sizeof(cint).} = enum
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, ## Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, ## Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, ## Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, ## Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, ## Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, ## Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, ## Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, ## Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, ## Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, ## Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ## Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ## Typical clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ## Alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ## Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ## Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ## Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ## Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ## Compute mode (See ::CUcomputemode for details)
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ## Maximum 1D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ## Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ## Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ## Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ## Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ## Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, ## Maximum 2D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, ## Maximum 2D layered texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, ## Maximum layers in a 2D layered texture
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ## Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ## Device can possibly execute multiple kernels concurrently
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ## Device has ECC support enabled
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ## PCI bus ID of the device
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, ## PCI device ID of the device
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, ## Device is using TCC driver model
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, ## Peak memory clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, ## Global memory bus width in bits
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, ## Size of L2 cache in bytes
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, ## Maximum resident threads per multiprocessor
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, ## Number of asynchronous engines
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, ## Device shares a unified address space with the host
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, ## Maximum 1D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, ## Maximum layers in a 1D layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, ## Alternate maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, ## Alternate maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, ## Alternate maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, ## PCI domain ID of the device
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, ## Pitch alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, ## Maximum cubemap texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, ## Maximum cubemap layered texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, ## Maximum layers in a cubemap layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, ## Maximum 1D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, ## Maximum 2D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, ## Maximum 2D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, ## Maximum 3D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, ## Maximum 3D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, ## Maximum 3D surface depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, ## Maximum 1D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, ## Maximum layers in a 1D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, ## Maximum 2D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, ## Maximum 2D layered surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, ## Maximum layers in a 2D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, ## Maximum cubemap surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, ## Maximum cubemap layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, ## Maximum layers in a cubemap layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, ## Maximum 2D linear texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, ## Maximum 2D linear texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, ## Maximum 2D linear texture pitch in bytes
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, ## Maximum mipmapped 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, ## Maximum mipmapped 2D texture height
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, ## Major compute capability version number
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, ## Minor compute capability version number
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, ## Maximum mipmapped 1D texture width
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, ## Device supports stream priorities
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, ## Device supports caching globals in L1
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, ## Device supports caching locals in L1
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, ## Maximum shared memory available per multiprocessor in bytes
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, ## Maximum number of 32-bit registers available per multiprocessor
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, ## Device can allocate managed memory on this system
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, ## Device is on a multi-GPU board
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, ## Unique id for a group of devices on the same multi-GPU board
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, ## Ratio of single precision performance (in floating-point operations per second) to double precision performance
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, ## Device can coherently access managed memory concurrently with the CPU
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, ## Device supports compute preemption.
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, ## Device can access host registered memory at the same virtual address as the CPU
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, ## ::cuStreamBatchMemOp and related APIs are supported.
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, ## ::CU_STREAM_WAIT_VALUE_NOR is supported.
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, ## Maximum optin shared memory per block
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See CUDA_MEMOP for additional details.
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, ## Device supports host memory registration via ::cudaHostRegister.
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, ## Device accesses pageable memory via the host's page tables.
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, ## The host can directly access managed memory on the device without migration.
    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, ## Maximum number of blocks per multiprocessor
    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, ## Device supports compression of memory
    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, ## Maximum L2 persisting lines capacity setting in bytes.
    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, ## Maximum value of CUaccessPolicyWindow::num_bytes.
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, ## Shared memory reserved by CUDA driver per block in bytes
    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, ## External timeline semaphore interop is supported on the device
    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, ## Handle types supported with mempool based IPC
    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, ## Indicates device supports cluster launch
    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, ## Device supports buffer sharing with dma_buf mechanism.
    CU_DEVICE_ATTRIBUTE_MAX

  CUcontext* = distinct pointer
  CUmodule* = distinct pointer
  CUfunction* = distinct pointer
  CUstream* = distinct pointer
  CUdeviceptr* = distinct pointer
{.push cdecl, importc, dynlib: "libcuda.so".}
proc cuInit*(flags: uint32): CUresult
proc cuDeviceGetCount*(count: var int32): CUresult
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
proc cuCtxDestroy*(ctx: CUcontext): CUresult
proc cuCtxSynchronize*(ctx: CUcontext): CUresult
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult
proc cuModuleUnload*(module: CUmodule): CUresult
proc cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult
proc cuLaunchKernel*(
       kernel: CUfunction,
       gridDimX, gridDimY, gridDimZ: uint32,
       blockDimX, blockDimY, blockDimZ: uint32,
       sharedMemBytes: uint32,
       stream: CUstream,
       kernelParams: ptr pointer,
       extra: ptr pointer
     ): CUresult
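# Hedged usage sketch (not run here: it needs a CUDA device, a loaded
# module and a `kernel` handle). Kernel arguments are passed as an array
# of pointers to the argument values, hence `kernelParams: ptr pointer`:
#
#   var a, b: CUdeviceptr            # previously cuMemAlloc'ed
#   var params = [pointer(a.addr), pointer(b.addr)]
#   check cuLaunchKernel(
#     kernel,
#     gridDimX = 1, gridDimY = 1, gridDimZ = 1,
#     blockDimX = 32, blockDimY = 1, blockDimZ = 1,
#     sharedMemBytes = 0,
#     stream = CUstream(nil),
#     kernelParams = params[0].addr,
#     extra = nil)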
type
  CUmemAttach_flags* = enum
    CU_MEM_ATTACH_GLOBAL = 0x1 ## Memory can be accessed by any stream on any device
    CU_MEM_ATTACH_HOST = 0x2   ## Memory cannot be accessed by any stream on any device
    CU_MEM_ATTACH_SINGLE = 0x4 ## Memory can only be accessed by a single stream on the associated device
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult
proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
cuModuleLoadData(module, sourceCode[0].unsafeAddr)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
cuModuleGetFunction(kernel, module, fnName[0].unsafeAddr)
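# Nim strings are guaranteed NUL-terminated, so passing a string through these
# openArray shims satisfies cuModuleLoadData's requirement that a PTX image be
# a NUL-terminated text buffer.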
proc cudaDeviceInit(): CUdevice =
check cuInit(0)
var devCount: int32
check cuDeviceGetCount(devCount)
if devCount == 0:
echo "cudaDeviceInit error: no devices supporting CUDA"
quit 1
var cuDevice: CUdevice
check cuDeviceGet(cuDevice, 0)
var name = newString(128)
check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
echo "Using CUDA Device [0]: ", name
var major, minor: int32
check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
echo "Compute Capability: SM ", major, ".", minor
if major < 6:
echo "Error: Device 0 is not sm_60 (Pascal generation, GTX 1080) or later"
quit 1
return cuDevice
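# A minimal `check` for CUresult, mirroring the NvvmResult template in the
# sanity check below, would look like this (sketch only; it assumes the
# CUresult enum's CUDA_SUCCESS value and is illustrative, not the gist's
# own helper):
#   template check(status: CUresult) =
#     let code = status # Assign so execution is done once only.
#     if code != CUDA_SUCCESS:
#       echo astToStr(status), " exited with error: ", code
#       quit 1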
# ############################################################
#
# NVVM
#
# ############################################################
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".} # Adjust the -L path to your CUDA installation.
type
NvvmResult* {.size: sizeof(cint).} = enum
NVVM_SUCCESS = 0
NVVM_ERROR_OUT_OF_MEMORY = 1
NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
NVVM_ERROR_IR_VERSION_MISMATCH = 3
NVVM_ERROR_INVALID_INPUT = 4
NVVM_ERROR_INVALID_PROGRAM = 5
NVVM_ERROR_INVALID_IR = 6
NVVM_ERROR_INVALID_OPTION = 7
NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
NVVM_ERROR_COMPILATION = 9
NvvmProgram = distinct pointer
{.push cdecl, importc, dynlib: "libnvvm.so".}
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult
proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
{.pop.} # {.push cdecl, importc, dynlib: "libnvvm.so".}
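# nvvmVerifyProgram/nvvmCompileProgram take nvcc-style libNVVM options
# (e.g. -opt=, -arch=, -ftz=). A hedged usage sketch targeting Pascal:
#   let opts = allocCStringArray(["-arch=compute_60", "-opt=3"])
#   check nvvmCompileProgram(prog, 2, opts)
#   deallocCStringArray(opts)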
# ############################################################
#
# NVVM IR
#
# ############################################################
proc tagCudaKernel*(module: ModuleRef, function: ValueRef) =
## Tag a function as a Cuda Kernel, i.e. callable from host
# Upstream bug, getReturnType returns tkFunction for void functions.
# doAssert function.getTypeOf().getReturnType().isVoid(), block:
# "Kernels must not return values but function returns " & $function.getTypeOf().getReturnType().getTypeKind()
let ctx = module.getContext()
module.addNamedMetadataOperand(
"nvvm.annotations",
ctx.asValueRef(ctx.metadataNode([
function.asMetadataRef(),
ctx.metadataNode("kernel"),
constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
]))
)
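# For reference, this annotation materializes as textual NVVM IR along these
# lines (sketch, e.g. for the addKernel built in the sanity check below):
#   !nvvm.annotations = !{!0}
#   !0 = !{void (i128*, i128, i128)* @addKernel, !"kernel", i32 1}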
# ############################################################
#
# Sanity Check
#
# ############################################################
when isMainModule:
template check(status: NvvmResult) =
let code = status # Assign so execution is done once only.
if code != NVVM_SUCCESS:
echo astToStr(status), " ", instantiationInfo(), " exited with error: ", code
echo code.nvvmGetErrorString()
quit 1
echo "Nvidia JIT compiler sanity check"
#######################################
# Metadata
const triple = "nvptx64-nvidia-cuda"
var irVersion: tuple[major, minor, majorDbg, minorDbg: int32]
block:
var version: tuple[major, minor: int32]
check: nvvmVersion(version.major, version.minor)
echo "nvvm v", version.major, ".", version.minor
check: nvvmIRVersion(irVersion.major, irVersion.minor, irVersion.majorDbg, irVersion.minorDbg)
echo "requires LLVM IR v", irVersion.major, ".", irVersion.minor
#######################################
# LLVM IR codegen
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
let ctx = createContext()
  var module = ctx.createModule("test_nvvm")
module.setTarget(triple)
module.setDataLayout(datalayout)
let i128 = ctx.int128_t()
let void_t = ctx.void_t()
let builder = ctx.createBuilder()
block:
let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let addKernel = module.addFunction("addKernel", addType)
let blck = ctx.append_basic_block(addKernel, "addBody")
builder.positionAtEnd(blck)
let r = addKernel.getParam(0)
let a = addKernel.getParam(1)
let b = addKernel.getParam(2)
let sum = builder.add(a, b, "sum")
discard builder.store(sum, r)
discard builder.retVoid()
module.tagCudaKernel(addKernel)
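  # The builder calls above should lower to IR along these lines (sketch):
  #   define void @addKernel(i128* %0, i128 %1, i128 %2) {
  #   addBody:
  #     %sum = add i128 %1, %2
  #     store i128 %sum, i128* %0
  #     ret void
  #   }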
block:
let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let mulKernel = module.addFunction("mulKernel", mulType)
let blck = ctx.append_basic_block(mulKernel, "mulBody")
builder.positionAtEnd(blck)
let r = mulKernel.getParam(0)
let a = mulKernel.getParam(1)
let b = mulKernel.getParam(2)
let prod = builder.mul(a, b, "prod")
discard builder.store(prod, r)
discard builder.retVoid()
module.tagCudaKernel(mulKernel)
block:
var errMsg: cstring
let errCode = module.verify(AbortProcessAction, errMsg)
echo "LLVM verification: code ", int(errCode), ", message \"", errMsg, "\""
errMsg.dispose()
block:
echo "================="
echo "LLVM IR output"
let ir = module.toIRString()
echo ir
ir.dispose()
echo "================="
#######################################
  # LLVM -> NVVM handover
var prog{.noInit.}: NvvmProgram
check nvvmCreateProgram(prog)
# module.writeBitcodeToFile("arith.bc")
let bitcode = module.toBitcode()
check nvvmAddModuleToProgram(prog, bitcode, "arith")
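  # libNVVM only accepts bitcode whose LLVM IR version matches what
  # nvvmIRVersion reported above; a mismatch fails with
  # NVVM_ERROR_IR_VERSION_MISMATCH, which is why the IR version is printed
  # at startup.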
# Cleanup LLVM
builder.dispose()
module.dispose()
ctx.dispose()
#######################################
# GPU codegen
check nvvmVerifyProgram(prog, 0, nil)
block:
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
var log = newString(logSize)
check nvvmGetProgramLog(prog, log[0].addr)
echo "log:"
echo log
echo "----------------"
check nvvmCompileProgram(prog, 0, nil)
var ptxSize: csize_t
check nvvmGetCompiledResultSize(prog, ptxSize)
var ptx = newString(ptxSize)
check nvvmGetCompiledResult(prog, ptx[0].addr)
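  # Note: the compiled-result size reported by libNVVM includes the trailing
  # NUL, so `ptx` is already a NUL-terminated image as cuModuleLoadData expects.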
block:
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
var log = newString(logSize)
check nvvmGetProgramLog(prog, log[0].addr)
echo "log:"
echo log
echo "----------------"
check nvvmDestroyProgram(prog)
echo "================="
echo "PTX output"
echo ptx
echo "================="
let cudaDevice = cudaDeviceInit()
var cuCtx: CUcontext
var cuMod: CUmodule
var addKernel, mulKernel: CUfunction
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
check cuModuleGetFunction(addKernel, cuMod, "addKernel")
check cuModuleGetFunction(mulKernel, cuMod, "mulKernel")
#######################################
# Kernel launch
func toHex*(a: uint64): string =
const hexChars = "0123456789abcdef"
const L = 2*sizeof(uint64)
result = newString(L)
var a = a
    for j in countdown(result.len-1, 0):
result[j] = hexChars[a and 0xF]
a = a shr 4
func toString*(a: openArray[uint64]): string =
result = "0x"
    for i in countdown(a.len-1, 0):
result.add toHex(a[i])
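  # With the inputs below, the expected 128-bit results are (hand-computed;
  # a = 2^73 - 1 and b = 2^52, so a*b = 2^125 - 2^52 still fits in 128 bits):
  #   a+b = 0x0000000000000200000fffffffffffff
  #   a*b = 0x1ffffffffffffffffff0000000000000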
var r{.noInit.}, a, b: array[2, uint64]
a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64
echo "r: ", r.toString()
echo "a: ", a.toString()
echo "b: ", b.toString()
var rGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]
check cuLaunchKernel(
addKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a+b: ", r.toString()
check cuLaunchKernel(
mulKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a*b: ", r.toString()
#######################################
# Cleanup
check cuMemFree(rGPU)
rGPU = CUdeviceptr(nil)
check cuModuleUnload(cuMod)
cuMod = CUmodule(nil)
check cuCtxDestroy(cuCtx)
cuCtx = CUcontext(nil)