# JIT compile Cuda kernel in Nim
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
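# Build configuration: `gorge` runs at compile time, querying llvm-config
# for the flags needed to compile and link against the LLVM C API.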
{.passc: gorge("llvm-config --cflags").}
{.passl: gorge("llvm-config --libs").}
# ############################################################
#
# Bindings to LLVM JIT
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMC.html
# Constantine is a library. It is possible that applications relying on Constantine
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
# ############################################################
#
# LLVM
#
# ############################################################
# TODO: by using the dynlib pragma (https://nim-lang.org/docs/manual.html#foreign-function-interface-dynlib-pragma-for-import)
# we wouldn't need the LLVM dev packages, only the runtime, significantly easing installation and install size.
# However there wouldn't be headers, openArray arguments would need a wrapper.
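# A minimal sketch of that alternative, assuming a system-wide libLLVM
# shared library (the exact soname varies across distributions):
#
#   proc createContext(): ContextRef {.importc: "LLVMContextCreate", dynlib: "libLLVM.so".}
#
# Without the headers, openArray parameters (which Nim lowers to a
# pointer + length pair) would need explicit ptr/len wrappers.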
type
  LlvmBool* = distinct int32
  MemoryBufferRef* {.importc: "LLVMMemoryBufferRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ContextRef* {.importc: "LLVMContextRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ModuleRef* {.importc: "LLVMModuleRef", header: "<llvm-c/Core.h>".} = distinct pointer
  TargetRef* {.importc: "LLVMTargetRef", header: "<llvm-c/Types.h>".} = distinct pointer
  ExecutionEngineRef* {.importc: "LLVMExecutionEngineRef", header: "<llvm-c/ExecutionEngine.h>".} = distinct pointer
  TypeRef* {.importc: "LLVMTypeRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ValueRef* {.importc: "LLVMValueRef", header: "<llvm-c/Core.h>".} = distinct pointer
  NamedMDNodeRef* {.importc: "LLVMNamedMDNodeRef", header: "<llvm-c/Types.h>".} = distinct pointer
  MetadataRef* {.importc: "LLVMMetadataRef", header: "<llvm-c/Types.h>".} = distinct pointer
{.push header: "<llvm-c/Core.h>".}
proc createContext*(): ContextRef {.importc: "LLVMContextCreate".}
proc dispose*(ctx: ContextRef) {.importc: "LLVMContextDispose".}
proc dispose*(msg: cstring) {.importc: "LLVMDisposeMessage".}
  ## cstrings returned by LLVM are owned by LLVM and must be destroyed with this specific function
proc dispose*(buf: MemoryBufferRef){.importc: "LLVMDisposeMemoryBuffer".}
proc getBufferStart*(buf: MemoryBufferRef): ptr byte {.importc: "LLVMGetBufferStart".}
proc getBufferSize*(buf: MemoryBufferRef): csize_t {.importc: "LLVMGetBufferSize".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################
#
# Module
#
# ############################################################
{.push header: "<llvm-c/Core.h>".}
proc llvmCreateModule(name: cstring, ctx: ContextRef): ModuleRef {.importc: "LLVMModuleCreateWithNameInContext".}
template createModule*(ctx: ContextRef, name: cstring): ModuleRef =
  llvmCreateModule(name, ctx)
proc dispose*(m: ModuleRef) {.importc: "LLVMDisposeModule".}
  ## Destroys a module
  ## Note: destroying an Execution Engine will also destroy modules attached to it
proc toIRString*(m: ModuleRef): cstring {.importc: "LLVMPrintModuleToString".}
  ## Print a module's IR to a textual string. The string must be disposed with LLVM's "dispose" or memory will leak.
proc getContext*(m: ModuleRef): ContextRef {.importc: "LLVMGetModuleContext".}
proc getOrInsertNamedMetadata*(m: ModuleRef, name: openArray[char]): NamedMDNodeRef {.importc: "LLVMGetOrInsertNamedMetadata".}
proc addNamedMetadataOperand*(m: ModuleRef, name: cstring, val: ValueRef) {.importc: "LLVMAddNamedMetadataOperand".}
proc metadataNode*(ctx: ContextRef, metadataNodes: openArray[MetadataRef]): MetadataRef {.importc: "LLVMMDNodeInContext2".}
proc metadataNode*(ctx: ContextRef, str: openArray[char]): MetadataRef {.importc: "LLVMMDStringInContext2".}
proc asMetadataRef*(val: ValueRef): MetadataRef {.importc: "LLVMValueAsMetadata".}
proc asValueRef*(ctx: ContextRef, md: MetadataRef): ValueRef {.importc: "LLVMMetadataAsValue".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
{.push header: "<llvm-c/BitWriter.h>".}
proc writeBitcodeToFile*(m: ModuleRef, path: cstring) {.importc: "LLVMWriteBitcodeToFile".}
proc writeBitcodeToMemoryBuffer*(m: ModuleRef): MemoryBufferRef {.importc: "LLVMWriteBitcodeToMemoryBuffer".}
  ## Write bitcode to a memory buffer
  ## The MemoryBuffer must be disposed appropriately or memory will leak
{.pop.} # {.push header: "<llvm-c/BitWriter.h>".}
proc toBitcode*(m: ModuleRef): seq[byte] =
  ## Serialize a module to LLVM bitcode
  let mb = m.writeBitcodeToMemoryBuffer()
  let len = int mb.getBufferSize()
  result.newSeq(len)
  copyMem(result[0].addr, mb.getBufferStart(), len)
  mb.dispose()
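when isMainModule:
  # A minimal self-check sketch for the serialization above (assumes the
  # LLVM dev libraries are linked via the llvm-config pragmas at the top):
  # even an empty module should serialize to a non-empty bitcode blob.
  block:
    let ctx = createContext()
    let m = ctx.createModule("bitcode_smoke_test")
    doAssert m.toBitcode().len > 0
    m.dispose()
    ctx.dispose()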
type VerifierFailureAction* {.size: sizeof(cint).} = enum
  AbortProcessAction # verifier will print to stderr and abort()
  PrintMessageAction # verifier will print to stderr and return 1
  ReturnStatusAction # verifier will just return 1
{.push header: "<llvm-c/Analysis.h>".}
proc verify*(module: ModuleRef, failureAction: VerifierFailureAction, msg: var cstring): LlvmBool {.importc: "LLVMVerifyModule".}
proc verify*(fn: ValueRef, failureAction: VerifierFailureAction): LlvmBool {.importc: "LLVMVerifyFunction".}
{.pop.}
# ############################################################
#
# Target
#
# ############################################################
{.push header: "<llvm-c/Target.h>".}
proc initializeNativeTarget*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeTarget".}
proc initializeNativeAsmPrinter*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeAsmPrinter".}
proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".}
{.pop.}
{.push header: "<llvm-c/Core.h>".}
proc setTarget*(module: ModuleRef, triple: cstring) {.importc: "LLVMSetTarget".}
proc setDataLayout*(module: ModuleRef, layout: cstring) {.importc: "LLVMSetDataLayout".}
{.pop.}
# ############################################################
#
# Execution Engine
#
# ############################################################
{.push header: "<llvm-c/ExecutionEngine.h>".}
proc linkInMCJIT*() {.importc: "LLVMLinkInMCJIT".}
proc createJITCompilerForModule*(
       engine: var ExecutionEngineRef,
       module: ModuleRef,
       optLevel: uint32,
       err: var cstring): LlvmBool {.importc: "LLVMCreateJITCompilerForModule".}
proc remove*(
       engine: ExecutionEngineRef,
       m: ModuleRef,
       outMod: var ModuleRef,
       err: var cstring): LlvmBool {.importc: "LLVMRemoveModule".}
proc dispose*(engine: ExecutionEngineRef) {.importc: "LLVMDisposeExecutionEngine".}
  ## Destroys an execution engine
  ## Note: destroying an Execution Engine will also destroy modules attached to it
proc getFunctionAddress*(engine: ExecutionEngineRef, name: cstring): uint64 {.importc: "LLVMGetFunctionAddress".}
  ## Returns the address of the JIT-compiled function (the C API returns a uint64_t)
{.pop.} # {.push header: "<llvm-c/ExecutionEngine.h>".}
# ############################################################
#
# Types
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreType.html
type
  TypeKind* {.size: sizeof(cint).} = enum
    tkVoid,           ## type with no size
    tkHalf,           ## 16 bit floating point type
    tkFloat,          ## 32 bit floating point type
    tkDouble,         ## 64 bit floating point type
    tkX86_FP80,       ## 80 bit floating point type (X87)
    tkFP128,          ## 128 bit floating point type (112-bit mantissa)
    tkPPC_FP128,      ## 128 bit floating point type (two 64-bits)
    tkLabel,          ## Labels
    tkInteger,        ## Arbitrary bit width integers
    tkFunction,       ## Functions
    tkStruct,         ## Structures
    tkArray,          ## Arrays
    tkPointer,        ## Pointers
    tkVector,         ## Fixed width SIMD vector type
    tkMetadata,       ## Metadata
    tkX86_MMX,        ## X86 MMX
    tkToken,          ## Tokens
    tkScalableVector, ## Scalable SIMD vector type
    tkBFloat,         ## 16 bit brain floating point type
    tkX86_AMX         ## X86 AMX
{.push header: "<llvm-c/Core.h>".}
proc getTypeKind*(ty: TypeRef): TypeKind {.importc: "LLVMGetTypeKind".}
proc void_t*(ctx: ContextRef): TypeRef {.importc: "LLVMVoidTypeInContext".}
# Integers
# ------------------------------------------------------------
proc int1_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt1TypeInContext".}
proc int8_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt8TypeInContext".}
proc int16_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt16TypeInContext".}
proc int32_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt32TypeInContext".}
proc int64_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt64TypeInContext".}
proc int128_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt128TypeInContext".}
proc int_t*(ctx: ContextRef, numBits: uint32): TypeRef {.importc: "LLVMIntTypeInContext".}
# Composite
# ------------------------------------------------------------
proc struct_t*(
       ctx: ContextRef,
       elemTypes: openArray[TypeRef], # requires implicit conversion of the length to uint32, which requires the header
       packed: LlvmBool): TypeRef {.importc: "LLVMStructTypeInContext".}
proc array_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
proc pointerType(elementType: TypeRef; addressSpace: cuint): TypeRef {.importc: "LLVMPointerType".}
# Functions
# ------------------------------------------------------------
proc function_t*(
       returnType: TypeRef,
       paramTypes: openArray[TypeRef], # requires implicit conversion of the length to uint32, which requires the header
       isVarArg: LlvmBool): TypeRef {.importc: "LLVMFunctionType".}
proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: "LLVMAddFunction".}
  ## Declare a function `name` in a module.
  ## Returns a handle to specify its instructions
proc printTypeToString(ty: TypeRef): cstring {.importc: "LLVMPrintTypeToString".}
# TODO: Function and Parameter attributes:
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#function-attributes
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#parameter-attributes
#
# We can use attributes to specify additional guarantees of Constantine code, for instance:
# - "pure" function with: nounwind, readonly
# - pointer particularities: readonly, writeonly, noalias, inalloca, byval
proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ------------------------------
proc `$`*(ty: TypeRef): string =
  let s = ty.printTypeToString()
  result = $s
  s.dispose()

proc isVoid*(ty: TypeRef): bool {.inline.} =
  ty.getTypeKind == tkVoid

proc pointer_t*(elementTy: TypeRef): TypeRef {.inline.} =
  pointerType(elementTy, addressSpace = 0)
# ############################################################
#
# Values
#
# ############################################################
{.push header: "<llvm-c/Core.h>".}
proc getTypeOf*(x: ValueRef): TypeRef {.importc: "LLVMTypeOf".}
# Constants
# ------------------------------------------------------------
# https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html
proc constInt*(ty: TypeRef, n: culonglong, signExtend: LlvmBool): ValueRef {.importc: "LLVMConstInt".}
proc constReal*(ty: TypeRef, n: cdouble): ValueRef {.importc: "LLVMConstReal".}
proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".}
proc constAllOnes*(ty: TypeRef): ValueRef {.importc: "LLVMConstAllOnes".}
proc constStruct*(
       vals: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       packed: LlvmBool): ValueRef {.importc: "LLVMConstStruct".}
proc constArray*(
       ty: TypeRef,
       constantVals: openArray[ValueRef] # requires implicit conversion of the length to uint32, which requires the header
     ): ValueRef {.importc: "LLVMConstArray".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################
#
# IR builder
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreInstructionBuilder.html
type
  BasicBlockRef* {.importc: "LLVMBasicBlockRef", header: "<llvm-c/Core.h>".} = distinct pointer
  BuilderRef* {.importc: "LLVMBuilderRef", header: "<llvm-c/Core.h>".} = distinct pointer
    ## An instruction builder represents a point within a basic block and is
    ## the exclusive means of building instructions using the C interface.

  IntPredicate* {.size: sizeof(cint).} = enum
    IntEQ = 32, ## equal
    IntNE,      ## not equal
    IntUGT,     ## unsigned greater than
    IntUGE,     ## unsigned greater or equal
    IntULT,     ## unsigned less than
    IntULE,     ## unsigned less or equal
    IntSGT,     ## signed greater than
    IntSGE,     ## signed greater or equal
    IntSLT,     ## signed less than
    IntSLE      ## signed less or equal
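# Note: IntEQ starts at 32 because LLVMIntPredicate mirrors LLVM's
# ICmpInst::Predicate numbering, where integer predicates follow the
# floating-point ones.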
{.push header: "<llvm-c/Core.h>".}
# Instantiation
# ------------------------------------------------------------
proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".}
  ## Append a basic block to the end of a function
proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".}
proc dispose*(builder: BuilderRef) {.importc: "LLVMDisposeBuilder".}
# Functions
# ------------------------------------------------------------
proc getParam*(fn: ValueRef, index: uint32): ValueRef {.importc: "LLVMGetParam".}
proc retVoid*(builder: BuilderRef): ValueRef {.importc: "LLVMBuildRetVoid".}
proc ret*(builder: BuilderRef, returnVal: ValueRef) {.importc: "LLVMBuildRet".}
# Positioning
# ------------------------------------------------------------
proc position*(builder: BuilderRef, blck: BasicBlockRef, instr: ValueRef) {.importc: "LLVMPositionBuilder".}
proc positionBefore*(builder: BuilderRef, instr: ValueRef) {.importc: "LLVMPositionBuilderBefore".}
proc positionAtEnd*(builder: BuilderRef, blck: BasicBlockRef) {.importc: "LLVMPositionBuilderAtEnd".}
# Intermediate Representation
# ------------------------------------------------------------
#
# - NSW: no signed wrap, signed value cannot over- or underflow.
# - NUW: no unsigned wrap, unsigned value cannot over- or underflow.
proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAdd".}
proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWAdd".}
proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWAdd".}
proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSub".}
proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWSub".}
proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWSub".}
proc neg*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNeg".}
proc negNSW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWNeg".}
proc negNUW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWNeg".}
proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildMul".}
proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWMul".}
proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWMul".}
proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildUDiv".}
proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactUDiv".}
proc divS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSDiv".}
proc divS_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactSDiv".}
proc remU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildURem".}
proc remS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSRem".}
proc lshl*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildShl".}
proc lshr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLShr".}
proc ashr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAShr".}
proc `and`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAnd".}
proc `or`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildOr".}
proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildXor".}
proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNot".}
proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSelect".}
proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildICmp".}
proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildBitCast".}
proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildTrunc".}
proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildZExt".}
  ## Zero-extend
proc sext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildSExt".}
  ## Sign-extend
proc malloc*(builder: BuilderRef, ty: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildMalloc".}
proc mallocArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildArrayMalloc".}
proc free*(builder: BuilderRef, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildFree".}
proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring): ValueRef {.importc: "LLVMBuildAlloca".}
proc allocaArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildArrayAlloca".}
proc getElementPtr2*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       name: cstring
     ): ValueRef {.importc: "LLVMBuildGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
proc getElementPtr2_InBounds*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef], # requires implicit conversion of the length to uint32, which requires the header
       name: cstring
     ): ValueRef {.importc: "LLVMBuildInBoundsGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
  ## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc getElementPtr2_Struct*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       idx: uint32,
       name: cstring
     ): ValueRef {.importc: "LLVMBuildStructGEP2".}
  ## https://www.llvm.org/docs/GetElementPtr.html
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
  ## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc load2*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLoad2".}
proc store*(builder: BuilderRef, val, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildStore".}
proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.importc: "LLVMBuildMemSet".}
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemCpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemMove".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
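when isMainModule:
  # A hedged example exercising the comparison/selection builders above,
  # which the addition check below does not cover: build an unsigned
  # `umax` with icmp + select, then verify the module.
  block:
    let ctx = createContext()
    let module = ctx.createModule("umax")
    let i64 = ctx.int64_t()
    let fnTy = function_t(i64, [i64, i64], isVarArg = LlvmBool(false))
    let fn = module.addFunction("umax", fnTy)
    let builder = ctx.createBuilder()
    builder.positionAtEnd(ctx.appendBasicBlock(fn, "body"))
    let aGtB = builder.icmp(IntUGT, fn.getParam(0), fn.getParam(1), "a_gt_b")
    builder.ret(builder.select(aGtB, fn.getParam(0), fn.getParam(1), "max"))
    var errMsg: cstring
    discard module.verify(AbortProcessAction, errMsg) # aborts the process if the IR is invalid
    errMsg.dispose()
    builder.dispose()
    module.dispose()
    ctx.dispose()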
# ############################################################
#
# Sanity Check
#
# ############################################################
when isMainModule:
  echo "LLVM JIT compiler sanity check"

  let ctx = createContext()
  var module = ctx.createModule("addition")
  let i32 = ctx.int32_t()

  let addType = function_t(i32, [i32, i32], isVarArg = LlvmBool(false))
  let addBody = module.addFunction("add", addType)

  let builder = ctx.createBuilder()
  let blck = ctx.appendBasicBlock(addBody, "addBody")
  builder.positionAtEnd(blck)

  block:
    let a = addBody.getParam(0)
    let b = addBody.getParam(1)
    let sum = builder.add(a, b, "sum")
    builder.ret(sum)

  block:
    var errMsg: cstring
    let errCode = module.verify(AbortProcessAction, errMsg)
    echo "Verification: code ", int(errCode), ", message \"", errMsg, "\""
    errMsg.dispose()

  var engine: ExecutionEngineRef
  block:
    let errCode = initializeNativeTarget()
    echo "Target init: code ", int(errCode)
    let errCodeASMPrinter = initializeNativeAsmPrinter() # Why is this necessary? Otherwise we get "LLVM ERROR: Target does not support MC emission!"
    echo "ASM printer init: code ", int(errCodeASMPrinter)

    var errMsg: cstring
    if bool createJITCompilerForModule(engine, module, optLevel = 0, errMsg):
      if errMsg.len > 0:
        echo errMsg
        echo "exiting ..."
      else:
        echo "JIT compiler: error without details ... exiting"
      quit 1

  let jitAdd = cast[proc(a, b: int32): int32 {.noconv.}](
    engine.getFunctionAddress("add"))
  echo "jitAdd(1, 2) = ", jitAdd(1, 2)
  doAssert jitAdd(1, 2) == 1 + 2

  block:
    # Cleanup
    # Note: when disposing the Execution Engine, attached modules are also disposed.
    # Here we go the extra mile of detaching the module for testing.
    builder.dispose()

    var errMsg: cstring
    let errCode = engine.remove(module, module, errMsg)
    echo "Detaching module from Execution Engine: code ", int(errCode), ", message \"", errMsg, "\""

    module.dispose()
    engine.dispose()
    ctx.dispose()
  echo "LLVM JIT - SUCCESS"
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Bindings to Nvidia GPUs libraries
#
# ############################################################
import ./llvm
type Flag*[E: enum] = distinct cint

func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
  ## Combine enum values into a bitflag.
  ## The enum should only have power-of-2 fields.
  # static:
  #   for val in E:
  #     assert (ord(val) and (ord(val) - 1)) == 0, "Enum values should all be power of 2, found " &
  #       $val & " with value " & $ord(val) & "."
  var flags = 0
  for val in e:
    flags = flags or ord(val)
  result = Flag[E](flags)
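when isMainModule:
  # Quick self-check of the combinator with a throwaway power-of-2 enum
  # (the enum is ours, purely for illustration):
  block:
    type DummyFlag = enum
      dfA = 1, dfB = 2, dfC = 4
    doAssert cint(flag(dfA, dfC)) == 5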
# ############################################################
#
# Cuda
#
# ############################################################
{.passl: "-L/opt/cuda/lib64 -lcuda".}
# Cuda offers 2 APIs:
# - cuda.h the driver API
# - cuda_runtime.h the runtime API
#
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
#
# We need to use the lower-level driver API for JIT modules loading and reloading
type
  CUresult* {.size: sizeof(cint).} = enum
    ## The API call returned with no errors. In the case of query calls, this
    ## also means that the operation being queried is complete (see
    ## ::cuEventQuery() and ::cuStreamQuery()).
    CUDA_SUCCESS = 0
    ## This indicates that one or more of the parameters passed to the API call
    ## is not within an acceptable range of values.
    CUDA_ERROR_INVALID_VALUE = 1
    ## The API call failed because it was unable to allocate enough memory to
    ## perform the requested operation.
    CUDA_ERROR_OUT_OF_MEMORY = 2
    ## This indicates that the CUDA driver has not been initialized with
    ## ::cuInit() or that initialization has failed.
    CUDA_ERROR_NOT_INITIALIZED = 3
    ## This indicates that the CUDA driver is in the process of shutting down.
    CUDA_ERROR_DEINITIALIZED = 4
    ## This indicates profiler is not initialized for this run. This can
    ## happen when the application is running with external profiling tools
    ## like visual profiler.
    CUDA_ERROR_PROFILER_DISABLED = 5
    ## Deprecated: it is no longer an error to attempt to enable/disable
    ## profiling via ::cuProfilerStart or ::cuProfilerStop without
    ## initialization.
    CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
    ## Deprecated: it is no longer an error to call cuProfilerStart() when
    ## profiling is already enabled.
    CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
    ## Deprecated: it is no longer an error to call cuProfilerStop() when
    ## profiling is already disabled.
    CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
    ## This indicates that the CUDA driver that the application has loaded is a
    ## stub library. Applications that run with the stub rather than a real
    ## driver loaded will result in CUDA API returning this error.
    CUDA_ERROR_STUB_LIBRARY = 34
    ## This indicates that requested CUDA device is unavailable at the current
    ## time. Devices are often unavailable due to use of
    ## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
    CUDA_ERROR_DEVICE_UNAVAILABLE = 46
    ## This indicates that no CUDA-capable devices were detected by the installed
    ## CUDA driver.
    CUDA_ERROR_NO_DEVICE = 100
    ## This indicates that the device ordinal supplied by the user does not
    ## correspond to a valid CUDA device or that the action requested is
    ## invalid for the specified device.
    CUDA_ERROR_INVALID_DEVICE = 101
    ## This error indicates that the Grid license is not applied.
    CUDA_ERROR_DEVICE_NOT_LICENSED = 102
    ## This indicates that the device kernel image is invalid. This can also
    ## indicate an invalid CUDA module.
    CUDA_ERROR_INVALID_IMAGE = 200
    ## This most frequently indicates that there is no context bound to the
    ## current thread. This can also be returned if the context passed to an
    ## API call is not a valid handle (such as a context that has had
    ## ::cuCtxDestroy() invoked on it). This can also be returned if a user
    ## mixes different API versions (i.e. 3010 context with 3020 API calls).
    ## See ::cuCtxGetApiVersion() for more details.
    CUDA_ERROR_INVALID_CONTEXT = 201
    ## This indicates that the context being supplied as a parameter to the
    ## API call was already the active context. Deprecated: it is no longer an
    ## error to attempt to push the active context via ::cuCtxPushCurrent().
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
    ## This indicates that a map or register operation has failed.
    CUDA_ERROR_MAP_FAILED = 205
    ## This indicates that an unmap or unregister operation has failed.
    CUDA_ERROR_UNMAP_FAILED = 206
    ## This indicates that the specified array is currently mapped and thus
    ## cannot be destroyed.
    CUDA_ERROR_ARRAY_IS_MAPPED = 207
    ## This indicates that the resource is already mapped.
    CUDA_ERROR_ALREADY_MAPPED = 208
    ## This indicates that there is no kernel image available that is suitable
    ## for the device. This can occur when a user specifies code generation
    ## options for a particular CUDA source file that do not include the
    ## corresponding device configuration.
    CUDA_ERROR_NO_BINARY_FOR_GPU = 209
    ## This indicates that a resource has already been acquired.
    CUDA_ERROR_ALREADY_ACQUIRED = 210
    ## This indicates that a resource is not mapped.
    CUDA_ERROR_NOT_MAPPED = 211
    ## This indicates that a mapped resource is not available for access as an
    ## array.
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
    ## This indicates that a mapped resource is not available for access as a
    ## pointer.
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
    ## This indicates that an uncorrectable ECC error was detected during
    ## execution.
    CUDA_ERROR_ECC_UNCORRECTABLE = 214
    ## This indicates that the ::CUlimit passed to the API call is not
    ## supported by the active device.
    CUDA_ERROR_UNSUPPORTED_LIMIT = 215
    ## This indicates that the ::CUcontext passed to the API call can
    ## only be bound to a single CPU thread at a time but is already
    ## bound to a CPU thread.
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
    ## This indicates that peer access is not supported across the given
    ## devices.
    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
    ## This indicates that a PTX JIT compilation failed.
    CUDA_ERROR_INVALID_PTX = 218
    ## This indicates an error with OpenGL or DirectX context.
    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
    ## This indicates that an uncorrectable NVLink error was detected during the
    ## execution.
    CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
    ## This indicates that the PTX JIT compiler library was not found.
    CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
    ## This indicates that the provided PTX was compiled with an unsupported toolchain.
    CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
    ## This indicates that the PTX JIT compilation was disabled.
    CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
    ## This indicates that the ::CUexecAffinityType passed to the API call is not
    ## supported by the active device.
    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
    ## This indicates that the device kernel source is invalid. This includes
    ## compilation/linker errors encountered in device code or user error.
    CUDA_ERROR_INVALID_SOURCE = 300
    ## This indicates that the file specified was not found.
    CUDA_ERROR_FILE_NOT_FOUND = 301
    ## This indicates that a link to a shared object failed to resolve.
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
    ## This indicates that initialization of a shared object failed.
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
    ## This indicates that an OS call failed.
    CUDA_ERROR_OPERATING_SYSTEM = 304
    ## This indicates that a resource handle passed to the API call was not
    ## valid. Resource handles are opaque types like ::CUstream and ::CUevent.
    CUDA_ERROR_INVALID_HANDLE = 400
    ## This indicates that a resource required by the API call is not in a
    ## valid state to perform the requested operation.
    CUDA_ERROR_ILLEGAL_STATE = 401
    ## This indicates that a named symbol was not found. Examples of symbols
    ## are global/constant variable names, driver function names, texture names,
    ## and surface names.
    CUDA_ERROR_NOT_FOUND = 500
    ## This indicates that asynchronous operations issued previously have not
    ## completed yet. This result is not actually an error, but must be indicated
    ## differently than ::CUDA_SUCCESS (which indicates completion). Calls that
    ## may return this value include ::cuEventQuery() and ::cuStreamQuery().
    CUDA_ERROR_NOT_READY = 600
    ## While executing a kernel, the device encountered a
    ## load or store instruction on an invalid memory address.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_ILLEGAL_ADDRESS = 700
    ## This indicates that a launch did not occur because it did not have
    ## appropriate resources. This error usually indicates that the user has
    ## attempted to pass too many arguments to the device kernel, or the
    ## kernel launch specifies too many threads for the kernel's register
    ## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
    ## when a 32-bit int is expected) is equivalent to passing too many
    ## arguments and can also result in this error.
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
    ## This indicates that the device kernel took too long to execute. This can
    ## only occur if timeouts are enabled - see the device attribute
    ## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_LAUNCH_TIMEOUT = 702
    ## This error indicates a kernel launch that uses an incompatible texturing
    ## mode.
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
    ## This error indicates that a call to ::cuCtxEnablePeerAccess() is
    ## trying to re-enable peer access to a context which has already
    ## had peer access to it enabled.
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
    ## This error indicates that ::cuCtxDisablePeerAccess() is
    ## trying to disable peer access which has not been enabled yet
    ## via ::cuCtxEnablePeerAccess().
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
    ## This error indicates that the primary context for the specified device
    ## has already been initialized.
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
    ## This error indicates that the context current to the calling thread
    ## has been destroyed using ::cuCtxDestroy, or is a primary context which
    ## has not yet been initialized.
    CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
    ## A device-side assert triggered during kernel execution. The context
    ## cannot be used anymore, and must be destroyed. All existing device
    ## memory allocations from this context are invalid and must be
    ## reconstructed if the program is to continue using CUDA.
    CUDA_ERROR_ASSERT = 710
    ## This error indicates that the hardware resources required to enable
    ## peer access have been exhausted for one or more of the devices
    ## passed to ::cuCtxEnablePeerAccess().
    CUDA_ERROR_TOO_MANY_PEERS = 711
    ## This error indicates that the memory range passed to ::cuMemHostRegister()
    ## has already been registered.
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
    ## This error indicates that the pointer passed to ::cuMemHostUnregister()
    ## does not correspond to any currently registered memory region.
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
    ## While executing a kernel, the device encountered a stack error.
    ## This can be due to stack corruption or exceeding the stack size limit.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_HARDWARE_STACK_ERROR = 714
    ## While executing a kernel, the device encountered an illegal instruction.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
    ## While executing a kernel, the device encountered a load or store instruction
    ## on a memory address which is not aligned.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_MISALIGNED_ADDRESS = 716
    ## While executing a kernel, the device encountered an instruction
    ## which can only operate on memory locations in certain address spaces
    ## (global, shared, or local), but was supplied a memory address not
    ## belonging to an allowed address space.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
    ## While executing a kernel, the device program counter wrapped its address space.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_INVALID_PC = 718
    ## An exception occurred on the device while executing a kernel. Common
    ## causes include dereferencing an invalid device pointer and accessing
    ## out of bounds shared memory. Less common cases can be system specific - more
    ## information about these cases can be found in the system specific user guide.
    ## This leaves the process in an inconsistent state and any further CUDA work
    ## will return the same error. To continue using CUDA, the process must be terminated
    ## and relaunched.
    CUDA_ERROR_LAUNCH_FAILED = 719
    ## This error indicates that the number of blocks launched per grid for a kernel that was
    ## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
    ## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
    ## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
    ## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
    ## This error indicates that the attempted operation is not permitted.
    CUDA_ERROR_NOT_PERMITTED = 800
    ## This error indicates that the attempted operation is not supported
    ## on the current system or device.
    CUDA_ERROR_NOT_SUPPORTED = 801
    ## This error indicates that the system is not yet ready to start any CUDA
    ## work. To continue using CUDA, verify the system configuration is in a
    ## valid state and all required driver daemons are actively running.
    ## More information about this error can be found in the system specific
    ## user guide.
    CUDA_ERROR_SYSTEM_NOT_READY = 802
    ## This error indicates that there is a mismatch between the versions of
    ## the display driver and the CUDA driver. Refer to the compatibility documentation
    ## for supported versions.
    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
    ## This error indicates that the system was upgraded to run with forward compatibility
    ## but the visible hardware detected by CUDA does not support this configuration.
    ## Refer to the compatibility documentation for the supported hardware matrix or ensure
    ## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
    ## environment variable.
    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
    ## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
    CUDA_ERROR_MPS_CONNECTION_FAILED = 805
    ## This error indicates that the remote procedure call between the MPS server and the MPS client failed.
    CUDA_ERROR_MPS_RPC_FAILURE = 806
    ## This error indicates that the MPS server is not ready to accept new MPS client requests.
    ## This error can be returned when the MPS server is in the process of recovering from a fatal failure.
    CUDA_ERROR_MPS_SERVER_NOT_READY = 807
    ## This error indicates that the hardware resources required to create MPS client have been exhausted.
    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
    ## This error indicates that the hardware resources required to support device connections have been exhausted.
    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
    ## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
    CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
    ## This error indicates that the operation is not permitted when
    ## the stream is capturing.
    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
    ## This error indicates that the current capture sequence on the stream
    ## has been invalidated due to a previous error.
    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
    ## This error indicates that the operation would have resulted in a merge
    ## of two independent capture sequences.
    CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
    ## This error indicates that the capture was not initiated in this stream.
    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
    ## This error indicates that the capture sequence contains a fork that was
    ## not joined to the primary stream.
    CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
    ## This error indicates that a dependency would have been created which
    ## crosses the capture sequence boundary. Only implicit in-stream ordering
    ## dependencies are allowed to cross the boundary.
    CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
    ## This error indicates a disallowed implicit dependency on a current capture
    ## sequence from cudaStreamLegacy.
    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
    ## This error indicates that the operation is not permitted on an event which
    ## was last recorded in a capturing stream.
    CUDA_ERROR_CAPTURED_EVENT = 907
    ## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
    ## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
    ## different thread.
    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
    ## This error indicates that the timeout specified for the wait operation has lapsed.
    CUDA_ERROR_TIMEOUT = 909
    ## This error indicates that the graph update was not performed because it included
    ## changes which violated constraints specific to instantiated graph update.
    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
    ## This indicates that an async error has occurred in a device outside of CUDA.
    ## If CUDA was waiting for an external device's signal before consuming shared data,
    ## the external device signaled an error indicating that the data is not valid for
    ## consumption. This leaves the process in an inconsistent state and any further CUDA
    ## work will return the same error. To continue using CUDA, the process must be
    ## terminated and relaunched.
    CUDA_ERROR_EXTERNAL_DEVICE = 911
    ## Indicates a kernel launch error due to cluster misconfiguration.
    CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
    ## This indicates that an unknown internal error has occurred.
    CUDA_ERROR_UNKNOWN = 999
template check*(status: CUresult) =
  ## Check the status code of a CUDA operation
  ## Exit the program with an error message on failure
  let code = status # Assign to ensure the input expression is evaluated only once
  if code != CUDA_SUCCESS:
    echo astToStr(status), " ", instantiationInfo(), " exited with error: ", code
    quit 1
type
  CUdevice* = distinct int32
    ## Compute Device handle

  CUdevice_attribute* {.size: sizeof(cint).} = enum
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, ## Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, ## Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, ## Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, ## Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, ## Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, ## Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, ## Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, ## Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, ## Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, ## Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ## Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ## Typical clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ## Alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ## Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ## Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ## Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ## Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ## Compute mode (See ::CUcomputemode for details)
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ## Maximum 1D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ## Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ## Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ## Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ## Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ## Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, ## Maximum 2D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, ## Maximum 2D layered texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, ## Maximum layers in a 2D layered texture
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ## Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ## Device can possibly execute multiple kernels concurrently
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ## Device has ECC support enabled
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ## PCI bus ID of the device
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, ## PCI device ID of the device
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, ## Device is using TCC driver model
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, ## Peak memory clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, ## Global memory bus width in bits
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, ## Size of L2 cache in bytes
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, ## Maximum resident threads per multiprocessor
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, ## Number of asynchronous engines
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, ## Device shares a unified address space with the host
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, ## Maximum 1D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, ## Maximum layers in a 1D layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, ## Alternate maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, ## Alternate maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, ## Alternate maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, ## PCI domain ID of the device
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, ## Pitch alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, ## Maximum cubemap texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, ## Maximum cubemap layered texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, ## Maximum layers in a cubemap layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, ## Maximum 1D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, ## Maximum 2D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, ## Maximum 2D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, ## Maximum 3D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, ## Maximum 3D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, ## Maximum 3D surface depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, ## Maximum 1D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, ## Maximum layers in a 1D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, ## Maximum 2D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, ## Maximum 2D layered surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, ## Maximum layers in a 2D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, ## Maximum cubemap surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, ## Maximum cubemap layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, ## Maximum layers in a cubemap layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, ## Maximum 2D linear texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, ## Maximum 2D linear texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, ## Maximum 2D linear texture pitch in bytes
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, ## Maximum mipmapped 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, ## Maximum mipmapped 2D texture height
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, ## Major compute capability version number
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, ## Minor compute capability version number
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, ## Maximum mipmapped 1D texture width
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, ## Device supports stream priorities
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, ## Device supports caching globals in L1
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, ## Device supports caching locals in L1
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, ## Maximum shared memory available per multiprocessor in bytes
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, ## Maximum number of 32-bit registers available per multiprocessor
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, ## Device can allocate managed memory on this system
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, ## Device is on a multi-GPU board
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, ## Unique id for a group of devices on the same multi-GPU board
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, ## Ratio of single precision performance (in floating-point operations per second) to double precision performance
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, ## Device can coherently access managed memory concurrently with the CPU
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, ## Device supports compute preemption.
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, ## Device can access host registered memory at the same virtual address as the CPU
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, ## ::cuStreamBatchMemOp and related APIs are supported.
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, ## ::CU_STREAM_WAIT_VALUE_NOR is supported.
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, ## Maximum optin shared memory per block
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See CUDA_MEMOP for additional details.
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, ## Device supports host memory registration via ::cudaHostRegister.
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, ## Device accesses pageable memory via the host's page tables.
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, ## The host can directly access managed memory on the device without migration.
    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, ## Maximum number of blocks per multiprocessor
    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, ## Device supports compression of memory
    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, ## Maximum L2 persisting lines capacity setting in bytes.
    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, ## Maximum value of CUaccessPolicyWindow::num_bytes.
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, ## Shared memory reserved by CUDA driver per block in bytes
    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, ## External timeline semaphore interop is supported on the device
    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, ## Handle types supported with mempool based IPC
    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, ## Indicates device supports cluster launch
    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, ## Device supports buffer sharing with dma_buf mechanism.
    CU_DEVICE_ATTRIBUTE_MAX

  CUcontext* = distinct pointer
  CUmodule* = distinct pointer
  CUfunction* = distinct pointer
  CUstream* = distinct pointer
  CUdeviceptr* = distinct pointer
{.push cdecl, importc, dynlib: "libcuda.so".}
proc cuInit*(flags: uint32): CUresult
proc cuDeviceGetCount*(count: var int32): CUresult
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
proc cuCtxDestroy*(ctx: CUcontext): CUresult
proc cuCtxSynchronize*(ctx: CUcontext): CUresult
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult
proc cuModuleUnload*(module: CUmodule): CUresult
proc cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult
proc cuLaunchKernel*(
       kernel: CUfunction,
       gridDimX, gridDimY, gridDimZ: uint32,
       blockDimX, blockDimY, blockDimZ: uint32,
       sharedMemBytes: uint32,
       stream: CUstream,
       kernelParams: ptr pointer,
       extra: ptr pointer
     ): CUresult
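# Hedged usage sketch (not run here: it needs a CUDA device, a loaded
# module and a `kernel` handle). Kernel arguments are passed as an array
# of pointers to the argument values, hence `kernelParams: ptr pointer`:
#
#   var a, b: CUdeviceptr            # previously cuMemAlloc'ed
#   var params = [pointer(a.addr), pointer(b.addr)]
#   check cuLaunchKernel(
#     kernel,
#     gridDimX = 1, gridDimY = 1, gridDimZ = 1,
#     blockDimX = 32, blockDimY = 1, blockDimZ = 1,
#     sharedMemBytes = 0,
#     stream = CUstream(nil),
#     kernelParams = params[0].addr,
#     extra = nil)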
type
  CUmemAttach_flags* = enum
    CU_MEM_ATTACH_GLOBAL = 0x1 ## Memory can be accessed by any stream on any device
    CU_MEM_ATTACH_HOST = 0x2   ## Memory cannot be accessed by any stream on any device
    CU_MEM_ATTACH_SINGLE = 0x4 ## Memory can only be accessed by a single stream on the associated device
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult
proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
cuModuleLoadData(module, sourceCode[0].unsafeAddr)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
cuModuleGetFunction(kernel, module, fnName[0].unsafeAddr)
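# Nim strings are guaranteed NUL-terminated, so passing a string through these
# openArray shims satisfies cuModuleLoadData's requirement that a PTX image be
# a NUL-terminated text buffer.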
proc cudaDeviceInit(): CUdevice =
check cuInit(0)
var devCount: int32
check cuDeviceGetCount(devCount)
if devCount == 0:
echo "cudaDeviceInit error: no devices supporting CUDA"
quit 1
var cuDevice: CUdevice
check cuDeviceGet(cuDevice, 0)
var name = newString(128)
check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
echo "Using CUDA Device [0]: ", name
var major, minor: int32
check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
echo "Compute Capability: SM ", major, ".", minor
if major < 6:
echo "Error: Device 0 is not sm_60 (Pascal generation, GTX 1080) or later"
quit 1
return cuDevice
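# A minimal `check` for CUresult, mirroring the NvvmResult template in the
# sanity check below, would look like this (sketch only; it assumes the
# CUresult enum's CUDA_SUCCESS value and is illustrative, not the gist's
# own helper):
#   template check(status: CUresult) =
#     let code = status # Assign so execution is done once only.
#     if code != CUDA_SUCCESS:
#       echo astToStr(status), " exited with error: ", code
#       quit 1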
# ############################################################
#
# NVVM
#
# ############################################################
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".} # Adjust the -L path to your CUDA installation.
type
NvvmResult* {.size: sizeof(cint).} = enum
NVVM_SUCCESS = 0
NVVM_ERROR_OUT_OF_MEMORY = 1
NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
NVVM_ERROR_IR_VERSION_MISMATCH = 3
NVVM_ERROR_INVALID_INPUT = 4
NVVM_ERROR_INVALID_PROGRAM = 5
NVVM_ERROR_INVALID_IR = 6
NVVM_ERROR_INVALID_OPTION = 7
NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
NVVM_ERROR_COMPILATION = 9
NvvmProgram = distinct pointer
{.push cdecl, importc, dynlib: "libnvvm.so".}
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult
proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
{.pop.} # {.push cdecl, importc, dynlib: "libnvvm.so".}
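# nvvmVerifyProgram/nvvmCompileProgram take nvcc-style libNVVM options
# (e.g. -opt=, -arch=, -ftz=). A hedged usage sketch targeting Pascal:
#   let opts = allocCStringArray(["-arch=compute_60", "-opt=3"])
#   check nvvmCompileProgram(prog, 2, opts)
#   deallocCStringArray(opts)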
# ############################################################
#
# NVVM IR
#
# ############################################################
proc tagCudaKernel*(module: ModuleRef, function: ValueRef) =
## Tag a function as a Cuda Kernel, i.e. callable from host
# Upstream bug, getReturnType returns tkFunction for void functions.
# doAssert function.getTypeOf().getReturnType().isVoid(), block:
# "Kernels must not return values but function returns " & $function.getTypeOf().getReturnType().getTypeKind()
let ctx = module.getContext()
module.addNamedMetadataOperand(
"nvvm.annotations",
ctx.asValueRef(ctx.metadataNode([
function.asMetadataRef(),
ctx.metadataNode("kernel"),
constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
]))
)
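# For reference, this annotation materializes as textual NVVM IR along these
# lines (sketch, e.g. for the addKernel built in the sanity check below):
#   !nvvm.annotations = !{!0}
#   !0 = !{void (i128*, i128, i128)* @addKernel, !"kernel", i32 1}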
# ############################################################
#
# Sanity Check
#
# ############################################################
when isMainModule:
template check(status: NvvmResult) =
let code = status # Assign so execution is done once only.
if code != NVVM_SUCCESS:
echo astToStr(status), " ", instantiationInfo(), " exited with error: ", code
echo code.nvvmGetErrorString()
quit 1
echo "Nvidia JIT compiler sanity check"
#######################################
# Metadata
const triple = "nvptx64-nvidia-cuda"
var irVersion: tuple[major, minor, majorDbg, minorDbg: int32]
block:
var version: tuple[major, minor: int32]
check: nvvmVersion(version.major, version.minor)
echo "nvvm v", version.major, ".", version.minor
check: nvvmIRVersion(irVersion.major, irVersion.minor, irVersion.majorDbg, irVersion.minorDbg)
echo "requires LLVM IR v", irVersion.major, ".", irVersion.minor
#######################################
# LLVM IR codegen
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
let ctx = createContext()
  var module = ctx.createModule("test_nvvm")
module.setTarget(triple)
module.setDataLayout(datalayout)
let i128 = ctx.int128_t()
let void_t = ctx.void_t()
let builder = ctx.createBuilder()
block:
let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let addKernel = module.addFunction("addKernel", addType)
let blck = ctx.append_basic_block(addKernel, "addBody")
builder.positionAtEnd(blck)
let r = addKernel.getParam(0)
let a = addKernel.getParam(1)
let b = addKernel.getParam(2)
let sum = builder.add(a, b, "sum")
discard builder.store(sum, r)
discard builder.retVoid()
module.tagCudaKernel(addKernel)
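  # The builder calls above should lower to IR along these lines (sketch):
  #   define void @addKernel(i128* %0, i128 %1, i128 %2) {
  #   addBody:
  #     %sum = add i128 %1, %2
  #     store i128 %sum, i128* %0
  #     ret void
  #   }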
block:
let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let mulKernel = module.addFunction("mulKernel", mulType)
let blck = ctx.append_basic_block(mulKernel, "mulBody")
builder.positionAtEnd(blck)
let r = mulKernel.getParam(0)
let a = mulKernel.getParam(1)
let b = mulKernel.getParam(2)
let prod = builder.mul(a, b, "prod")
discard builder.store(prod, r)
discard builder.retVoid()
module.tagCudaKernel(mulKernel)
block:
var errMsg: cstring
let errCode = module.verify(AbortProcessAction, errMsg)
echo "LLVM verification: code ", int(errCode), ", message \"", errMsg, "\""
errMsg.dispose()
block:
echo "================="
echo "LLVM IR output"
let ir = module.toIRString()
echo ir
ir.dispose()
echo "================="
#######################################
  # LLVM -> NVVM handover
var prog{.noInit.}: NvvmProgram
check nvvmCreateProgram(prog)
# module.writeBitcodeToFile("arith.bc")
let bitcode = module.toBitcode()
check nvvmAddModuleToProgram(prog, bitcode, "arith")
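  # libNVVM only accepts bitcode whose LLVM IR version matches what
  # nvvmIRVersion reported above; a mismatch fails with
  # NVVM_ERROR_IR_VERSION_MISMATCH, which is why the IR version is printed
  # at startup.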
# Cleanup LLVM
builder.dispose()
module.dispose()
ctx.dispose()
#######################################
# GPU codegen
check nvvmVerifyProgram(prog, 0, nil)
block:
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
var log = newString(logSize)
check nvvmGetProgramLog(prog, log[0].addr)
echo "log:"
echo log
echo "----------------"
check nvvmCompileProgram(prog, 0, nil)
var ptxSize: csize_t
check nvvmGetCompiledResultSize(prog, ptxSize)
var ptx = newString(ptxSize)
check nvvmGetCompiledResult(prog, ptx[0].addr)
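  # Note: the compiled-result size reported by libNVVM includes the trailing
  # NUL, so `ptx` is already a NUL-terminated image as cuModuleLoadData expects.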
block:
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
var log = newString(logSize)
check nvvmGetProgramLog(prog, log[0].addr)
echo "log:"
echo log
echo "----------------"
check nvvmDestroyProgram(prog)
echo "================="
echo "PTX output"
echo ptx
echo "================="
let cudaDevice = cudaDeviceInit()
var cuCtx: CUcontext
var cuMod: CUmodule
var addKernel, mulKernel: CUfunction
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
check cuModuleGetFunction(addKernel, cuMod, "addKernel")
check cuModuleGetFunction(mulKernel, cuMod, "mulKernel")
#######################################
# Kernel launch
func toHex*(a: uint64): string =
const hexChars = "0123456789abcdef"
const L = 2*sizeof(uint64)
result = newString(L)
var a = a
    for j in countdown(result.len-1, 0):
result[j] = hexChars[a and 0xF]
a = a shr 4
func toString*(a: openArray[uint64]): string =
result = "0x"
    for i in countdown(a.len-1, 0):
result.add toHex(a[i])
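  # With the inputs below, the expected 128-bit results are (hand-computed;
  # a = 2^73 - 1 and b = 2^52, so a*b = 2^125 - 2^52 still fits in 128 bits):
  #   a+b = 0x0000000000000200000fffffffffffff
  #   a*b = 0x1ffffffffffffffffff0000000000000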
var r{.noInit.}, a, b: array[2, uint64]
a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64
echo "r: ", r.toString()
echo "a: ", a.toString()
echo "b: ", b.toString()
var rGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]
check cuLaunchKernel(
addKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a+b: ", r.toString()
check cuLaunchKernel(
mulKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a*b: ", r.toString()
#######################################
# Cleanup
check cuMemFree(rGPU)
rGPU = CUdeviceptr(nil)
check cuModuleUnload(cuMod)
cuMod = CUmodule(nil)
check cuCtxDestroy(cuCtx)
cuCtx = CUcontext(nil)