Skip to content

Instantly share code, notes, and snippets.

@MasonProtter
Last active August 16, 2021 13:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MasonProtter/7de88773b44f35cad796fd66e253a78a to your computer and use it in GitHub Desktop.
Save MasonProtter/7de88773b44f35cad796fd66e253a78a to your computer and use it in GitHub Desktop.
#+BEGIN_SRC jupyter-julia
using CUDA
"""
    StaticString{N} <: AbstractString

Fixed-length string whose `N` characters are stored inline as an `NTuple{N, Char}`.

The length is part of the type and the only field is a tuple of `Char`s. Every
character counts as exactly one code unit, so the valid string indices are `1:N`.
"""
struct StaticString{N} <: AbstractString
    chars::NTuple{N, Char}
end

# Minimal `AbstractString` interface. Without these methods the generic string
# machinery (`length`, `==`, indexing, iteration, `print`, ...) throws a `MethodError`.
Base.ncodeunits(::StaticString{N}) where {N} = N
Base.codeunit(::StaticString) = UInt32
# The code unit is the character's Unicode code point; this throws for malformed `Char`s.
Base.codeunit(s::StaticString, i::Integer) = UInt32(s.chars[i])
# Out-of-range indices are reported as invalid rather than raising an error.
Base.isvalid(s::StaticString, i::Integer) = 1 <= i <= ncodeunits(s)

function Base.iterate(s::StaticString, i::Integer=1)
    1 <= i <= ncodeunits(s) || return nothing
    return (s.chars[i], Int(i) + 1)
end
"""
    s"text"

Build a `StaticString` from a string literal. The literal is split into characters
while the macro expands, so the length parameter `N` is a compile-time constant.
"""
macro s_str(s)
    chars = Tuple(s)
    N = length(chars)
    # No `esc`: the only inputs are literal values, and leaving the expression hygienic
    # makes `StaticString` resolve in the module that defines this macro rather than in
    # whatever module the literal happens to be written in.
    return :(StaticString{$N}($chars))
end
"""
    String(s::StaticString)

Return an ordinary `String` made of the characters of `s`, in order.
"""
function Base.String(s::StaticString)
    # Join the characters straight from the tuple, with no separator.
    return join(s.chars)
end
"""
    show(io::IO, s::StaticString)

Write `s` to `io` as `StaticString{N}(...)`, where the argument is the contents
rendered as a quoted, escaped string literal.
"""
function Base.show(io::IO, s::StaticString{N}) where {N}
    print(io, "StaticString{", N, "}(")
    # `show` on a `String` adds the surrounding quotes and escapes embedded quotes,
    # backslashes and control characters, so the output stays unambiguous.
    show(io, String(s))
    print(io, ")")
    return nothing
end
"""
    *(s1::StaticString, s2::StaticString, rest::StaticString...)

Concatenate `StaticString`s. The result is a `StaticString` whose length parameter is
the sum of the operands' lengths.
"""
function Base.:(*)(s1::StaticString{N}, s2::StaticString{M}) where {N, M}
    return StaticString{N + M}((s1.chars..., s2.chars...))
end

# `a * b * c` parses as the single call `*(a, b, c)`. Without this method that call
# falls through to Base's generic `AbstractString` concatenation instead of producing a
# `StaticString`, so fold left over the binary method above.
function Base.:(*)(
    s1::StaticString, s2::StaticString, s3::StaticString, rest::StaticString...
)
    return *(s1 * s2, s3, rest...)
end
# Broadcast `*` over two `cu` arrays of StaticStrings and dump the PTX generated for
# the broadcast kernel (recorded in the results block below).
# All elements of one array must share the same length N so that the element type is the
# concrete `StaticString{N}`. "hello  " is padded to 7 characters to match "goodbye";
# the recorded kernel signature shows StaticString{4} and StaticString{7} inputs and a
# StaticString{11} output.
@device_code_ptx cu([s"abc ", s"123 "]) .* cu([s"hello  ", s"goodbye"])
#+END_SRC
#+RESULTS:
#+BEGIN_EXAMPLE
// PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceArray{StaticString{11},1,CUDA.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},typeof(*),Tuple{Base.Broadcast.Extruded{CuDeviceArray{StaticString{4},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{StaticString{7},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}}}}, Int64) for sm_75
//
// Generated by LLVM NVPTX Back-End
//
.version 6.3
.target sm_75
.address_size 64
// .globl _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 // -- Begin function _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64
.weak .global .align 8 .u64 exception_flag;
// @_Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64
.visible .entry _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64(
.param .align 8 .b8 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_0[16],
.param .align 8 .b8 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_1[72],
.param .u64 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_2
)
{
.reg .pred %p<10>;
.reg .b16 %rs<6>;
.reg .b32 %r<28>;
.reg .b64 %rd<63>;
// %bb.0: // %top
ld.param.u64 %rd39, [_Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_2];
setp.lt.s64 %p1, %rd39, 1;
@%p1 bra LBB0_8;
// %bb.1: // %L12.preheader
mov.b64 %rd37, _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_0;
mov.b64 %rd38, _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_1;
ld.param.u64 %rd1, [%rd38+8];
ld.param.u64 %rd3, [%rd38+40];
ld.param.u64 %rd4, [%rd38+56];
ld.param.u8 %rs1, [%rd38+16];
ld.param.u8 %rs2, [%rd38+48];
ld.param.u64 %rd5, [%rd37];
ld.param.u64 %rd6, [%rd37+8];
setp.gt.s64 %p2, %rd39, 0;
selp.b64 %rd62, %rd39, 0, %p2;
mov.u32 %r1, %tid.x;
add.s32 %r2, %r1, 1;
cvt.u64.u32 %rd8, %r2;
mov.u32 %r3, %ctaid.x;
cvt.u64.u32 %rd9, %r3;
mov.u32 %r4, %ntid.x;
cvt.u64.u32 %rd10, %r4;
mov.u32 %r5, %nctaid.x;
cvt.u64.u32 %rd11, %r5;
and.b16 %rs3, %rs1, 255;
setp.eq.s16 %p3, %rs3, 0;
@%p3 bra LBB0_3;
// %bb.2: // %L12.preheader36
mul.lo.s64 %rd40, %rd10, %rd9;
add.s64 %rd61, %rd8, %rd40;
add.s64 %rd41, %rd61, -1;
mul.lo.s64 %rd13, %rd11, %rd10;
shl.b64 %rd42, %rd41, 4;
add.s64 %rd43, %rd1, %rd42;
add.s64 %rd60, %rd43, 8;
shl.b64 %rd15, %rd13, 4;
mul.lo.s64 %rd44, %rd41, 44;
add.s64 %rd45, %rd6, %rd44;
add.s64 %rd59, %rd45, 20;
mul.lo.s64 %rd17, %rd13, 44;
and.b16 %rs4, %rs2, 255;
setp.eq.s16 %p5, %rs4, 0;
LBB0_7: // %L12
// =>This Inner Loop Header: Depth=1
setp.ge.s64 %p4, %rd5, %rd61;
@%p4 bra LBB0_6;
bra.uni LBB0_8;
LBB0_6: // %L63
// in Loop: Header=BB0_7 Depth=1
ld.global.u32 %r6, [%rd60+-8];
ld.global.u32 %r7, [%rd60+-4];
ld.global.u32 %r8, [%rd60];
ld.global.u32 %r9, [%rd60+4];
selp.b64 %rd46, %rd4, %rd61, %p5;
mul.lo.s64 %rd47, %rd46, 28;
add.s64 %rd48, %rd3, %rd47;
ld.global.u32 %r10, [%rd48+-28];
ld.global.u32 %r11, [%rd48+-24];
ld.global.u32 %r12, [%rd48+-20];
ld.global.u32 %r13, [%rd48+-16];
ld.global.u32 %r14, [%rd48+-12];
ld.global.u32 %r15, [%rd48+-8];
ld.global.u32 %r16, [%rd48+-4];
st.global.u32 [%rd59+-20], %r6;
st.global.u32 [%rd59+-16], %r7;
st.global.u32 [%rd59+-12], %r8;
st.global.u32 [%rd59+-8], %r9;
st.global.u32 [%rd59+-4], %r10;
st.global.u32 [%rd59], %r11;
st.global.u32 [%rd59+4], %r12;
st.global.u32 [%rd59+8], %r13;
st.global.u32 [%rd59+12], %r14;
st.global.u32 [%rd59+16], %r15;
st.global.u32 [%rd59+20], %r16;
add.s64 %rd62, %rd62, -1;
add.s64 %rd61, %rd61, %rd13;
add.s64 %rd60, %rd60, %rd15;
add.s64 %rd59, %rd59, %rd17;
setp.eq.s64 %p6, %rd62, 0;
@%p6 bra LBB0_8;
bra.uni LBB0_7;
LBB0_3: // %L12.preheader.split.us
ld.param.u64 %rd2, [%rd38+24];
shl.b64 %rd49, %rd2, 4;
add.s64 %rd18, %rd1, %rd49;
mul.lo.s64 %rd50, %rd10, %rd9;
add.s64 %rd57, %rd8, %rd50;
mul.lo.s64 %rd51, %rd57, 44;
mul.lo.s64 %rd20, %rd11, %rd10;
add.s64 %rd52, %rd51, %rd6;
add.s64 %rd56, %rd52, -24;
mul.lo.s64 %rd22, %rd20, 44;
and.b16 %rs5, %rs2, 255;
setp.eq.s16 %p8, %rs5, 0;
LBB0_4: // %L12.us
// =>This Inner Loop Header: Depth=1
setp.lt.s64 %p7, %rd5, %rd57;
@%p7 bra LBB0_8;
// %bb.5: // %L63.us
// in Loop: Header=BB0_4 Depth=1
ld.global.u32 %r17, [%rd18+-16];
ld.global.u32 %r18, [%rd18+-12];
ld.global.u32 %r19, [%rd18+-8];
ld.global.u32 %r20, [%rd18+-4];
selp.b64 %rd53, %rd4, %rd57, %p8;
mul.lo.s64 %rd54, %rd53, 28;
add.s64 %rd55, %rd3, %rd54;
ld.global.u32 %r21, [%rd55+-28];
ld.global.u32 %r22, [%rd55+-24];
ld.global.u32 %r23, [%rd55+-20];
ld.global.u32 %r24, [%rd55+-16];
ld.global.u32 %r25, [%rd55+-12];
ld.global.u32 %r26, [%rd55+-8];
ld.global.u32 %r27, [%rd55+-4];
st.global.u32 [%rd56+-20], %r17;
st.global.u32 [%rd56+-16], %r18;
st.global.u32 [%rd56+-12], %r19;
st.global.u32 [%rd56+-8], %r20;
st.global.u32 [%rd56+-4], %r21;
st.global.u32 [%rd56], %r22;
st.global.u32 [%rd56+4], %r23;
st.global.u32 [%rd56+8], %r24;
st.global.u32 [%rd56+12], %r25;
st.global.u32 [%rd56+16], %r26;
st.global.u32 [%rd56+20], %r27;
add.s64 %rd62, %rd62, -1;
add.s64 %rd57, %rd57, %rd20;
add.s64 %rd56, %rd56, %rd22;
setp.eq.s64 %p9, %rd62, 0;
@%p9 bra LBB0_8;
bra.uni LBB0_4;
LBB0_8: // %L62
ret;
// -- End function
}
#+END_EXAMPLE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment