Skip to content

Instantly share code, notes, and snippets.

@MasonProtter
Created August 20, 2020 05:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MasonProtter/8f8ae49ff63eb6f1e1f19e11fb2a3ebe to your computer and use it in GitHub Desktop.
Save MasonProtter/8f8ae49ff63eb6f1e1f19e11fb2a3ebe to your computer and use it in GitHub Desktop.
#+BEGIN_SRC jupyter-julia
using CUDA
"""
    StaticString{N} <: AbstractString

Fixed-length string whose `N` characters are stored inline as an
`NTuple{N, Char}` in the field `chars`.

The struct is immutable and its only field is a tuple of `Char`, so every
concrete `StaticString{N}` is an `isbits` type.

The methods defined right after the struct implement the minimal
`AbstractString` interface, using one index per character, so that generic
string functions (`length`, `==`, indexing, `print`, `collect`, ...) work.
"""
struct StaticString{N} <: AbstractString
    chars::NTuple{N, Char}
end

# --- Minimal AbstractString interface -------------------------------------
# Each stored `Char` is treated as one code unit, so every index in `1:N` is
# a valid character index.

# Number of code units equals the number of stored characters.
Base.ncodeunits(::StaticString{N}) where {N} = N

# One code unit per character, exposed as a `UInt32`.
Base.codeunit(::StaticString) = UInt32

# Code unit `i` is the code point of the `i`-th character
# (throws `BoundsError` from the tuple lookup when `i` is out of range).
Base.codeunit(s::StaticString, i::Int) = UInt32(s.chars[i])

# Every in-range index starts a character.
Base.isvalid(s::StaticString, i::Int) = 1 <= i <= ncodeunits(s)

# The length is known from the type parameter; no scan needed.
Base.length(::StaticString{N}) where {N} = N

# Iterate the stored characters in order; the state is the next index.
function Base.iterate(s::StaticString, i::Int=1)
    1 <= i <= ncodeunits(s) || return nothing
    return (s.chars[i], i + 1)
end
"""
    s"..."

String-literal macro that builds a `StaticString` at macro-expansion time.
`s"abc"` expands to a call of the form `StaticString{3}(('a', 'b', 'c'))`:
the length and the tuple of characters are computed from the literal and
spliced into the expansion as constants.

The characters are taken exactly as the macro receives them; no unescaping
is performed here.
"""
macro s_str(s)
    chars = Tuple(s)
    N = length(chars)
    # Deliberately NOT wrapped in `esc`: nothing in the expansion comes from
    # the caller's code, so hygiene should resolve `StaticString` in the module
    # that defines this macro rather than in whatever scope the literal is used.
    return :(StaticString{$N}($chars))
end
"""
    String(s::StaticString)

Return a `String` made of the characters in `s.chars`, in order.
"""
function Base.String(s::StaticString)
    return join(s.chars)
end
"""
    show(io::IO, s::StaticString)

Write `s` to `io` as `StaticString{N}("...")`, where `N` is the length
parameter and `...` is the stored characters printed verbatim (unescaped).
"""
function Base.show(io::IO, s::StaticString{N}) where {N}
    print(io, "StaticString{", N, "}(\"")
    join(io, s.chars)  # prints each stored character in order
    print(io, "\")")
    return nothing
end
"""
    *(s1::StaticString, s2::StaticString, rest::StaticString...)

Concatenate `StaticString`s. The result is a `StaticString` whose length
parameter is the sum of the operands' lengths and whose characters are the
operands' characters in order.
"""
function Base.:(*)(s1::StaticString{N}, s2::StaticString{M}) where {N, M}
    return StaticString{N + M}((s1.chars..., s2.chars...))
end

# `a * b * c` parses to the single call `*(a, b, c)`, which would skip the
# two-argument method above and fall through to Base's generic string `*`.
# Fold pairwise so chained concatenation still yields a `StaticString`.
function Base.:(*)(
    s1::StaticString, s2::StaticString, s3::StaticString, rest::StaticString...
)
    return *(s1 * s2, s3, rest...)
end
# Broadcast `*` over two device arrays of StaticStrings and dump the generated SASS.
# Every literal within one array must have the same length so the array gets a
# single concrete element type: 4 characters on the left, 7 on the right
# (hence the two trailing spaces after "hello"). This matches the
# StaticString{4} / StaticString{7} -> StaticString{11} kernel signature in
# the recorded results below.
@device_code_sass cu([s"abc ", s"123 "]) .* cu([s"hello  ", s"goodbye"])
#+END_SRC
#+RESULTS:
#+BEGIN_EXAMPLE
// PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceArray{StaticString{11},1,CUDA.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},typeof(*),Tuple{Base.Broadcast.Extruded{CuDeviceArray{StaticString{4},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{StaticString{7},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}}}}, Int64) for sm_75
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
.elftype @"ET_EXEC"
//--------------------- .text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 --------------------------
.section .text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=40"
.align 128
.global _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64
.type _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,@function
.size _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,(.L_27 - _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64)
.other _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64:
.text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64:
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x1b8] ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1bc] ;
ISETP.GE.U32.AND P0, PT, R0, 0x1, PT ;
ISETP.GE.AND.EX P0, PT, R2, RZ, PT, P0 ;
@!P0 EXIT ;
S2R R6, SR_TID.X ;
ULDC.U8 UR4, c[0x0][0x180] ;
ISETP.LT.U32.AND P0, PT, RZ, c[0x0][0x1b8], PT ;
ULOP3.LUT UR4, UR4, 0xff, URZ, 0xc0, !UP7 ;
S2R R9, SR_CTAID.X ;
ULDC.U8 UR6, c[0x0][0x1a0] ;
ISETP.GT.AND.EX P0, PT, R2, RZ, PT, P0 ;
ULOP3.LUT UR6, UR6, 0xff, URZ, 0xc0, !UP7 ;
ISETP.NE.AND P1, PT, RZ, UR4, PT ;
SEL R0, RZ, c[0x0][0x1b8], !P0 ;
SEL R2, RZ, c[0x0][0x1bc], !P0 ;
IADD3 R6, R6, 0x1, RZ ;
@!P1 BRA `(.L_2) ;
IMAD.MOV.U32 R7, RZ, RZ, RZ ;
UMOV UR4, URZ ;
IMAD.MOV.U32 R8, RZ, RZ, c[0x0][0x168] ;
ISETP.NE.AND P1, PT, RZ, UR6, PT ;
IMAD.WIDE.U32 R6, R9, c[0x0][0x0], R6 ;
MOV R9, c[0x0][0x16c] ;
ISETP.GT.U32.AND P0, PT, R6.reuse, c[0x0][0x160], PT ;
IADD3 R16, R7, UR4, RZ ;
IADD3 R3, P2, R6, -0x1, RZ ;
ISETP.GT.AND.EX P0, PT, R16.reuse, c[0x0][0x164], PT, P0 ;
LEA R10, P3, R3.reuse, c[0x0][0x178], 0x4 ;
IMAD.WIDE.U32 R8, R3, 0x2c, R8 ;
IADD3.X R4, R16, -0x1, RZ, P2, !PT ;
@P0 EXIT ;
IMAD.MOV.U32 R5, RZ, RZ, c[0x0][0x0] ;
LEA.HI.X R7, R3, c[0x0][0x17c], R4, 0x4, P3 ;
IMAD R31, R4, 0x2c, RZ ;
UMOV UR4, URZ ;
IMAD.WIDE.U32 R4, R5, c[0x0][0xc], RZ ;
IADD3 R14, P2, R8, 0x14, RZ ;
IADD3 R10, P0, R10, 0x8, RZ ;
IMAD.MOV.U32 R33, RZ, RZ, R6 ;
IMAD.X R31, R9, 0x1, R31, P2 ;
IADD3 R37, R5, UR4, RZ ;
IMAD.X R7, RZ, RZ, R7, P0 ;
SHF.L.U64.HI R35, R4, 0x4, R37 ;
.L_3:
SEL R3, R33, c[0x0][0x1a8], P1 ;
IMAD.MOV.U32 R9, RZ, RZ, c[0x0][0x19c] ;
MOV R8, c[0x0][0x198] ;
SEL R6, R16, c[0x0][0x1ac], P1 ;
IMAD.WIDE.U32 R8, R3, 0x1c, R8 ;
IMAD R11, R6, 0x1c, RZ ;
IMAD.MOV.U32 R6, RZ, RZ, R10 ;
IMAD.IADD R9, R9, 0x1, R11 ;
LDG.E.SYS R3, [R6+-0x8] ;
LDG.E.SYS R11, [R6+-0x4] ;
LDG.E.SYS R13, [R6] ;
LDG.E.SYS R15, [R6+0x4] ;
LDG.E.SYS R17, [R8+-0x1c] ;
LDG.E.SYS R19, [R8+-0x18] ;
LDG.E.SYS R21, [R8+-0x14] ;
LDG.E.SYS R23, [R8+-0x10] ;
LDG.E.SYS R25, [R8+-0xc] ;
LDG.E.SYS R27, [R8+-0x8] ;
LDG.E.SYS R29, [R8+-0x4] ;
IADD3 R10, P0, R0, -0x1, RZ ;
IADD3 R0, P2, R33, R4, RZ ;
IADD3.X R12, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R10, RZ, PT ;
IMAD.X R2, R16, 0x1, R37, P2 ;
MOV R9, R31 ;
IMAD.MOV.U32 R8, RZ, RZ, R14 ;
ISETP.NE.AND.EX P0, PT, R12, RZ, PT, P0 ;
STG.E.SYS [R8+-0x14], R3 ;
STG.E.SYS [R8+-0x10], R11 ;
STG.E.SYS [R8+-0xc], R13 ;
LEA R3, P2, R4, R6, 0x4 ;
STG.E.SYS [R8+-0x8], R15 ;
STG.E.SYS [R8+-0x4], R17 ;
STG.E.SYS [R8], R19 ;
STG.E.SYS [R8+0x4], R21 ;
STG.E.SYS [R8+0x8], R23 ;
STG.E.SYS [R8+0xc], R25 ;
STG.E.SYS [R8+0x10], R27 ;
STG.E.SYS [R8+0x14], R29 ;
@!P0 EXIT ;
IMAD.MOV.U32 R33, RZ, RZ, R0 ;
IMAD.MOV.U32 R16, RZ, RZ, R2 ;
IMAD.WIDE.U32 R8, R4, 0x2c, R8 ;
ISETP.GT.U32.AND P0, PT, R33, c[0x0][0x160], PT ;
IMAD R31, R37, 0x2c, RZ ;
ISETP.GT.AND.EX P0, PT, R16, c[0x0][0x164], PT, P0 ;
IMAD.X R6, R7, 0x1, R35, P2 ;
IMAD.MOV.U32 R0, RZ, RZ, R10 ;
IADD3 R31, R9, R31, RZ ;
IMAD.MOV.U32 R14, RZ, RZ, R8 ;
MOV R7, R6 ;
IMAD.MOV.U32 R2, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R3 ;
@!P0 BRA `(.L_3) ;
EXIT ;
.L_2:
IMAD.MOV.U32 R7, RZ, RZ, RZ ;
UMOV UR8, URZ ;
ISETP.NE.AND P1, PT, RZ, UR6, PT ;
ULDC UR5, c[0x0][0x188] ;
IMAD.WIDE.U32 R8, R9, c[0x0][0x0], R6 ;
ULDC UR4, c[0x0][0x178] ;
ULEA UR4, UP0, UR5, UR4, 0x4 ;
IMAD.MOV.U32 R6, RZ, RZ, c[0x0][0x168] ;
ULDC UR7, c[0x0][0x18c] ;
IMAD.MOV.U32 R7, RZ, RZ, c[0x0][0x16c] ;
ISETP.GT.U32.AND P0, PT, R8, c[0x0][0x160], PT ;
IADD3 R12, R9, UR8, RZ ;
ULDC UR8, c[0x0][0x17c] ;
IMAD.WIDE.U32 R6, R8, 0x2c, R6 ;
ULEA.HI.X UR5, UR5, UR8, UR7, 0x4, UP0 ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x164], PT, P0 ;
@P0 EXIT ;
IMAD.MOV.U32 R4, RZ, RZ, c[0x0][0x0] ;
IADD3 R10, P0, R6, -0x18, RZ ;
IMAD R29, R12, 0x2c, RZ ;
BMOV.32.CLEAR RZ, B0 ;
IMAD.WIDE.U32 R4, R4, c[0x0][0xc], RZ ;
UMOV UR6, URZ ;
BSSY B0, `(.L_4) ;
IADD3.X R29, R7, -0x1, R29, P0, !PT ;
MOV R31, R8 ;
IADD3 R33, R5, UR6, RZ ;
.L_5:
SEL R3, R31, c[0x0][0x1a8], P1 ;
IMAD.MOV.U32 R6, RZ, RZ, c[0x0][0x198] ;
SEL R8, R12, c[0x0][0x1ac], P1 ;
IMAD.MOV.U32 R7, RZ, RZ, c[0x0][0x19c] ;
LDG.E.SYS R11, [UR4+-0x8] ;
IMAD.WIDE.U32 R6, R3, 0x1c, R6 ;
LDG.E.SYS R13, [UR4+-0x4] ;
IMAD R9, R8, 0x1c, RZ ;
LDG.E.SYS R3, [UR4+-0x10] ;
IMAD.IADD R7, R7, 0x1, R9 ;
LDG.E.SYS R9, [UR4+-0xc] ;
LDG.E.SYS R15, [R6+-0x1c] ;
LDG.E.SYS R17, [R6+-0x18] ;
LDG.E.SYS R19, [R6+-0x14] ;
LDG.E.SYS R21, [R6+-0x10] ;
LDG.E.SYS R23, [R6+-0xc] ;
LDG.E.SYS R25, [R6+-0x8] ;
LDG.E.SYS R27, [R6+-0x4] ;
IADD3 R8, P0, R0, -0x1, RZ ;
IADD3.X R14, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R8, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R14, RZ, PT, P0 ;
IMAD.MOV.U32 R6, RZ, RZ, R10 ;
MOV R7, R29 ;
IADD3 R0, P2, R31, R4, RZ ;
IMAD.X R2, R12, 0x1, R33, P2 ;
STG.E.SYS [R6+-0xc], R11 ;
STG.E.SYS [R6+-0x8], R13 ;
STG.E.SYS [R6+-0x14], R3 ;
STG.E.SYS [R6+-0x10], R9 ;
STG.E.SYS [R6+-0x4], R15 ;
STG.E.SYS [R6], R17 ;
STG.E.SYS [R6+0x4], R19 ;
STG.E.SYS [R6+0x8], R21 ;
STG.E.SYS [R6+0xc], R23 ;
STG.E.SYS [R6+0x10], R25 ;
STG.E.SYS [R6+0x14], R27 ;
@!P0 EXIT ;
IMAD.MOV.U32 R31, RZ, RZ, R0 ;
IMAD.MOV.U32 R12, RZ, RZ, R2 ;
IMAD.WIDE.U32 R6, R4, 0x2c, R6 ;
ISETP.GT.U32.AND P0, PT, R31, c[0x0][0x160], PT ;
IMAD R29, R33, 0x2c, RZ ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x164], PT, P0 ;
IMAD.MOV.U32 R0, RZ, RZ, R8 ;
MOV R10, R6 ;
IMAD.IADD R29, R29, 0x1, R7 ;
IMAD.MOV.U32 R2, RZ, RZ, R14 ;
@!P0 BRA `(.L_5) ;
BSYNC B0 ;
.L_4:
EXIT ;
.L_6:
BRA `(.L_6);
.L_27:
#+END_EXAMPLE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment