Skip to content

Instantly share code, notes, and snippets.

@maxwindiff

maxwindiff/1d.jl Secret

Created February 14, 2023 07:18
Show Gist options
  • Save maxwindiff/8d530dd43b3d626bf0aac3fe4f943ee9 to your computer and use it in GitHub Desktop.
Save maxwindiff/8d530dd43b3d626bf0aac3fe4f943ee9 to your computer and use it in GitHub Desktop.
julia> using Revise, Metal, BenchmarkTools, StaticArrays
julia> n = 16384;  # matrix side length (16384 == 2^14)
julia> nn = n * n;  # total number of elements in an n-by-n matrix
julia> a = MtlArray(rand(Float32, n, n));  # n-by-n Float32 MtlArray; the kernels below write into it
julia> b = MtlArray(rand(Float32, n, n));  # n-by-n Float32 MtlArray; the kernels below read from it
julia> function copy1d(a, b, ::Val{dims}) where {dims}  # kernel: each thread copies one element b[I] into a[I]; dims arrives as a type parameter via Val
I = @inbounds CartesianIndices(dims)[thread_position_in_grid_1d()]  # turn this thread's linear grid position into a Cartesian index over dims
@inbounds a[I] = b[I]  # bounds checks are disabled and there is no range guard: assumes the launch never starts more threads than there are indices in dims (TODO confirm at call sites)
return  # kernel returns nothing
end
copy1d (generic function with 1 method)
julia> function run1d()  # launch copy1d on the (non-const) globals a and b with a 1D launch configuration
@Metal.sync begin  # NOTE(review): presumably waits for the GPU work to finish so timings cover the whole copy; confirm against Metal.jl docs
@metal threads=1024 grid=cld(nn, 1024) copy1d(a, b, Val(axes(a)))  # cld(nn, 1024) is nn / 1024 rounded up; axes(a) is wrapped in Val so the kernel receives the dims as a type parameter
end
end
run1d (generic function with 1 method)
julia> @device_code_llvm run1d()
; CompilerJob of kernel #copy1d(MtlDeviceMatrix{Float32, 1}, MtlDeviceMatrix{Float32, 1}, Val{(Base.OneTo(16384), Base.OneTo(16384))}) for GPUCompiler.MetalCompilerTarget
; @ REPL[6]:1 within `copy1d`
define cc103 void @_Z17julia_copy1d_213614MtlDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EE3ValI38_Base_OneTo_16384___Base_OneTo_16384__E({ i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i32 %thread_position_in_grid) local_unnamed_addr #0 {
conversion:
%2 = bitcast { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0 to float addrspace(1)* addrspace(1)*
%.unpack12 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %2, align 8
%.unpack8.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
%.unpack8.unpack = load i64, i64 addrspace(1)* %.unpack8.elt, align 8
%3 = bitcast { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1 to float addrspace(1)* addrspace(1)*
%.unpack18 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %3, align 8
%.unpack14.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
%.unpack14.unpack = load i64, i64 addrspace(1)* %.unpack14.elt, align 8
; @ REPL[6]:2 within `copy1d`
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/intrinsics/arguments.jl:48 within `thread_position_in_grid_1d`
; │┌ @ int.jl:87 within `+`
%4 = add i32 %thread_position_in_grid, 1
; └└
; ┌ @ abstractarray.jl:1241 within `getindex`
; │┌ @ indices.jl:330 within `to_indices` @ indices.jl:333
; ││┌ @ indices.jl:277 within `to_index` @ indices.jl:292
; │││┌ @ number.jl:7 within `convert`
; ││││┌ @ boot.jl:764 within `Int64`
; │││││┌ @ boot.jl:688 within `toInt64`
%5 = zext i32 %4 to i64
; │└└└└└
; │┌ @ abstractarray.jl:1286 within `_getindex`
; ││┌ @ abstractarray.jl:1293 within `_to_subscript_indices`
; │││┌ @ abstractarray.jl:1315 within `_unsafe_ind2sub`
; ││││┌ @ abstractarray.jl:2639 within `_ind2sub` @ abstractarray.jl:2677
; │││││┌ @ int.jl:86 within `-`
%6 = add nsw i64 %5, -1
; │││││└
; │││││┌ @ abstractarray.jl:2690 within `_ind2sub_recurse`
; ││││││┌ @ abstractarray.jl:2697 within `_div`
; │││││││┌ @ int.jl:288 within `div`
%7 = sdiv i64 %6, 16384
; ││││││└└
; ││││││ @ abstractarray.jl:2691 within `_ind2sub_recurse`
; ││││││┌ @ int.jl:88 within `*`
%.neg = mul nsw i64 %7, -16384
; ││││││└
; ││││││┌ @ int.jl:87 within `+`
%8 = add nsw i64 %.neg, %5
; └└└└└└└
; @ REPL[6]:3 within `copy1d`
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:115 within `getindex`
; │┌ @ abstractarray.jl:1280 within `_to_linear_index`
; ││┌ @ abstractarray.jl:2634 within `_sub2ind`
; │││┌ @ abstractarray.jl:95 within `axes`
; ││││┌ @ tuple.jl:222 within `map`
; │││││┌ @ range.jl:455 within `oneto`
; ││││││┌ @ range.jl:453 within `OneTo` @ range.jl:440
; │││││││┌ @ promotion.jl:488 within `max`
; ││││││││┌ @ essentials.jl:489 within `ifelse`
%9 = icmp sgt i64 %.unpack14.unpack, 0
%10 = select i1 %9, i64 %.unpack14.unpack, i64 0
; │││└└└└└└
; │││ @ abstractarray.jl:2634 within `_sub2ind` @ abstractarray.jl:2650
; │││┌ @ abstractarray.jl:2666 within `_sub2ind_recurse` @ abstractarray.jl:2666
; ││││┌ @ int.jl:88 within `*`
%11 = mul i64 %10, %7
; ││││└
; ││││┌ @ int.jl:87 within `+`
%12 = add nsw i64 %8, -1
; │└└└└
; │ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:115 within `getindex` @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:103
; │┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:82 within `arrayref`
; ││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:81 within `unsafe_load`
; │││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:9 within `pointerref`
; ││││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:9 within `macro expansion` @ /Users/kichi/.julia/dev/LLVM/src/interop/base.jl:40
; │││││┌ @ int.jl:86 within `-`
%13 = add i64 %12, %11
; │││││└
%14 = getelementptr inbounds float, float addrspace(1)* %.unpack18, i64 %13
%15 = load float, float addrspace(1)* %14, align 4
; └└└└└
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:118 within `setindex!`
; │┌ @ abstractarray.jl:1280 within `_to_linear_index`
; ││┌ @ abstractarray.jl:2634 within `_sub2ind`
; │││┌ @ abstractarray.jl:95 within `axes`
; ││││┌ @ tuple.jl:222 within `map`
; │││││┌ @ range.jl:455 within `oneto`
; ││││││┌ @ range.jl:453 within `OneTo` @ range.jl:440
; │││││││┌ @ promotion.jl:488 within `max`
; ││││││││┌ @ essentials.jl:489 within `ifelse`
%16 = icmp sgt i64 %.unpack8.unpack, 0
%17 = select i1 %16, i64 %.unpack8.unpack, i64 0
; │││└└└└└└
; │││ @ abstractarray.jl:2634 within `_sub2ind` @ abstractarray.jl:2650
; │││┌ @ abstractarray.jl:2666 within `_sub2ind_recurse` @ abstractarray.jl:2666
; ││││┌ @ int.jl:88 within `*`
%18 = mul i64 %17, %7
; │└└└└
; │ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:118 within `setindex!` @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:105
; │┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:88 within `arrayset`
; ││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:84 within `unsafe_store!`
; │││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:44 within `pointerset`
; ││││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:44 within `macro expansion` @ /Users/kichi/.julia/dev/LLVM/src/interop/base.jl:40
; │││││┌ @ int.jl:86 within `-`
%19 = add i64 %12, %18
; │││││└
%20 = getelementptr inbounds float, float addrspace(1)* %.unpack12, i64 %19
store float %15, float addrspace(1)* %20, align 4
; └└└└└
; @ REPL[6]:4 within `copy1d`
ret void
}
julia> function run1d_static()  # same launch as run1d, but wraps a and b in SizedArray so the 16384x16384 size is part of the argument type
@Metal.sync begin
@metal threads=1024 grid=cld(nn, 1024) copy1d(
# sizes are hardcoded to 16384 x 16384 (the value of n in this session) for now
SizedArray{Tuple{16384, 16384}}(a),  # the wrapper's .data field is an MtlMatrix, which is not isbits; this is what the KernelError in the output below reports
SizedArray{Tuple{16384, 16384}}(b), Val(axes(a)))
end
end
run1d_static (generic function with 1 method)
julia> @device_code_llvm run1d_static()
; CompilerJob of kernel #copy1d(SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, Val{(Base.OneTo(16384), Base.OneTo(16384))}) for GPUCompiler.MetalCompilerTarget
; @ REPL[6]:1 within `copy1d`
define cc103 void @_Z17julia_copy1d_328910SizedArrayI5TupleILi16384ELi16384EE7Float32Li2ELi2E8MtlArrayIS1_Li2EEES_IS0_ILi16384ELi16384EES1_Li2ELi2ES2_IS1_Li2EEE3ValI38_Base_OneTo_16384___Base_OneTo_16384__E([1 x {}*] addrspace(1)* %0, [1 x {}*] addrspace(1)* %1, i32 %thread_position_in_grid) local_unnamed_addr #0 {
conversion:
%2 = getelementptr [1 x {}*], [1 x {}*] addrspace(1)* %0, i64 0, i64 0
%.unpack = load {}*, {}* addrspace(1)* %2, align 8
%3 = getelementptr [1 x {}*], [1 x {}*] addrspace(1)* %1, i64 0, i64 0
%.unpack3 = load {}*, {}* addrspace(1)* %3, align 8
; @ REPL[6]:2 within `copy1d`
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/intrinsics/arguments.jl:48 within `thread_position_in_grid_1d`
; │┌ @ int.jl:87 within `+`
%4 = add i32 %thread_position_in_grid, 1
; └└
; ┌ @ abstractarray.jl:1241 within `getindex`
; │┌ @ indices.jl:330 within `to_indices` @ indices.jl:333
; ││┌ @ indices.jl:277 within `to_index` @ indices.jl:292
; │││┌ @ number.jl:7 within `convert`
; ││││┌ @ boot.jl:764 within `Int64`
; │││││┌ @ boot.jl:688 within `toInt64`
%5 = zext i32 %4 to i64
; │└└└└└
; │┌ @ abstractarray.jl:1286 within `_getindex`
; ││┌ @ abstractarray.jl:1293 within `_to_subscript_indices`
; │││┌ @ abstractarray.jl:1315 within `_unsafe_ind2sub`
; ││││┌ @ abstractarray.jl:2639 within `_ind2sub` @ abstractarray.jl:2677
; │││││┌ @ int.jl:86 within `-`
%6 = add nsw i64 %5, -1
; │││││└
; │││││┌ @ abstractarray.jl:2690 within `_ind2sub_recurse`
; ││││││┌ @ abstractarray.jl:2697 within `_div`
; │││││││┌ @ int.jl:288 within `div`
%7 = sdiv i64 %6, 16384
; ││││││└└
; ││││││ @ abstractarray.jl:2691 within `_ind2sub_recurse`
; ││││││┌ @ int.jl:88 within `*`
%.neg = mul nsw i64 %7, -16384
; ││││││└
; ││││││┌ @ int.jl:87 within `+`
%8 = add nsw i64 %.neg, %5
; └└└└└└└
; @ REPL[6]:3 within `copy1d`
; ┌ @ abstractarray.jl:1241 within `getindex`
; │┌ @ abstractarray.jl:1274 within `_getindex`
; ││┌ @ abstractarray.jl:1280 within `_to_linear_index`
; │││┌ @ abstractarray.jl:2634 within `_sub2ind` @ abstractarray.jl:2650
; ││││┌ @ abstractarray.jl:2666 within `_sub2ind_recurse` @ abstractarray.jl:2666
; │││││┌ @ int.jl:88 within `*`
%9 = shl nuw nsw i64 %7, 14
; │││││└
; │││││┌ @ int.jl:87 within `+`
%10 = add nsw i64 %8, %9
; ││└└└└
; ││┌ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:92 within `getindex` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:9
call fastcc void @julia_assertscalar_3551({}* inttoptr (i64 4510252144 to {}*)) #0
; │││ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:92 within `getindex` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:11
; │││┌ @ boot.jl:473 within `Array` @ boot.jl:459
%11 = call nonnull {}* inttoptr (i64 4373202372 to {}* ({}*, i64)*)({}* nonnull inttoptr (i64 4767457904 to {}*), i64 1)
; │││└
; │││ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:92 within `getindex` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:12
call fastcc void @julia_copyto__3721({}* %11, i64 signext 1, {}* %.unpack3, i64 signext %10, i64 signext 1) #0
; │││ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:92 within `getindex` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:13 @ array.jl:924
%12 = bitcast {}* %11 to { i8*, i64, i16, i16, i32 }*
%13 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %12, i64 0, i32 1
%14 = load i64, i64* %13, align 8
%.not = icmp eq i64 %14, 0
br i1 %.not, label %oob, label %idxend
oob: ; preds = %conversion
call fastcc void @gpu_report_exception()
call fastcc void @gpu_signal_exception()
call void @llvm.trap()
unreachable
idxend: ; preds = %conversion
%15 = bitcast {}* %11 to float**
%16 = load float*, float** %15, align 8
%17 = load float, float* %16, align 4
%18 = icmp ne {}* %.unpack, null
call void @llvm.assume(i1 %18)
; └└└
; ┌ @ abstractarray.jl:1344 within `setindex!`
; │┌ @ abstractarray.jl:1367 within `_setindex!`
; ││┌ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:93 within `setindex!` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:17
call fastcc void @julia_assertscalar_3551({}* inttoptr (i64 4513126576 to {}*)) #0
; │││ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:93 within `setindex!` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:19
; │││┌ @ array.jl:404 within `getindex`
; ││││┌ @ boot.jl:459 within `Array`
%19 = call nonnull {}* inttoptr (i64 4373202372 to {}* ({}*, i64)*)({}* nonnull inttoptr (i64 4767457904 to {}*), i64 1)
; ││││└
; ││││ @ array.jl:407 within `getindex`
; ││││┌ @ array.jl:966 within `setindex!`
%20 = bitcast {}* %19 to float**
%21 = load float*, float** %20, align 8
store float %17, float* %21, align 4
; │││└└
; │││ @ /Users/kichi/.julia/packages/StaticArrays/QPt9n/src/SizedArray.jl:93 within `setindex!` @ /Users/kichi/.julia/dev/GPUArrays/src/host/indexing.jl:20
call fastcc void @julia_copyto__3309({}* %.unpack, i64 signext %10, {}* %19, i64 signext 1, i64 signext 1) #0
; └└└
; @ REPL[6]:4 within `copy1d`
ret void
}
ERROR: GPU compilation of kernel #copy1d(SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, Val{(Base.OneTo(16384), Base.OneTo(16384))}) failed
KernelError: passing and using non-bitstype argument
Argument 2 to your kernel function is of type SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, which is not isbits:
.data is of type MtlMatrix{Float32} which is not isbits.
Stacktrace:
[1] check_invocation(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/dev/GPUCompiler/src/validation.jl:88
[2] macro expansion
@ ~/.julia/dev/GPUCompiler/src/driver.jl:154 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/dev/GPUCompiler/src/driver.jl:152 [inlined]
[5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
@ GPUCompiler ~/.julia/dev/GPUCompiler/src/utils.jl:83
[6] emit_julia
@ ~/.julia/dev/GPUCompiler/src/utils.jl:77 [inlined]
[7] mtlfunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
@ Metal ~/.julia/dev/Metal/src/compiler/execution.jl:165
[8] #39
@ ~/.julia/dev/Metal/src/compiler/execution.jl:161 [inlined]
[9] JuliaContext(f::Metal.var"#39#40"{GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams, GPUCompiler.FunctionSpec{typeof(copy1d), Tuple{SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, Val{(Base.OneTo(16384), Base.OneTo(16384))}}}}})
@ GPUCompiler ~/.julia/dev/GPUCompiler/src/driver.jl:76
[10] mtlfunction_compile(job::GPUCompiler.CompilerJob)
@ Metal ~/.julia/dev/Metal/src/compiler/execution.jl:160
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(Metal.mtlfunction_compile), linker::typeof(Metal.mtlfunction_link))
@ GPUCompiler ~/.julia/dev/GPUCompiler/src/cache.jl:90
[12] mtlfunction(f::typeof(copy1d), tt::Type{Tuple{SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, Val{(Base.OneTo(16384), Base.OneTo(16384))}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Metal ~/.julia/dev/Metal/src/compiler/execution.jl:148
[13] mtlfunction(f::typeof(copy1d), tt::Type{Tuple{SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, SizedMatrix{16384, 16384, Float32, 2, MtlMatrix{Float32}}, Val{(Base.OneTo(16384), Base.OneTo(16384))}}})
@ Metal ~/.julia/dev/Metal/src/compiler/execution.jl:141
[14] macro expansion
@ ~/.julia/dev/Metal/src/compiler/execution.jl:64 [inlined]
[15] macro expansion
@ ./REPL[9]:3 [inlined]
[16] macro expansion
@ ~/.julia/dev/Metal/src/utilities.jl:10 [inlined]
[17] run1d_static()
@ Main ./REPL[9]:2
[18] top-level scope
@ ~/.julia/dev/GPUCompiler/src/reflection.jl:205
[19] top-level scope
@ ~/.julia/dev/Metal/src/initialization.jl:25
julia> function copy1d_manual(a, b)  # copy kernel with the linear -> (row, column) split written by hand using bit operations
pos = thread_position_in_grid_1d() - 1  # zero-based linear position of this thread
r = (pos & 16383) + 1  # 16383 == 2^14 - 1: the low 14 bits give the zero-based row, +1 for 1-based indexing (only valid when the first dimension is 16384)
c = (pos >> 14) + 1  # the remaining high bits give the zero-based column, +1 for 1-based indexing
@inbounds a[r,c] = b[r,c]  # bounds checks are disabled; no guard against positions past the end of the arrays
return  # kernel returns nothing
end
copy1d_manual (generic function with 1 method)
julia> function run1d_manual()  # launch copy1d_manual on the (non-const) globals a and b with the same 1D configuration as run1d
@Metal.sync begin  # NOTE(review): presumably waits for the GPU work to finish so timings cover the whole copy; confirm against Metal.jl docs
@metal threads=1024 grid=cld(nn, 1024) copy1d_manual(a, b)  # cld(nn, 1024) is nn / 1024 rounded up; no Val argument, the kernel hardcodes the 16384-row layout
end
end
run1d_manual (generic function with 1 method)
julia> @device_code_llvm run1d_manual()
; CompilerJob of kernel #copy1d_manual(MtlDeviceMatrix{Float32, 1}, MtlDeviceMatrix{Float32, 1}) for GPUCompiler.MetalCompilerTarget
; @ REPL[11]:1 within `copy1d_manual`
define cc103 void @_Z24julia_copy1d_manual_398214MtlDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EE({ i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i32 %thread_position_in_grid) local_unnamed_addr #0 {
conversion:
%2 = bitcast { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0 to float addrspace(1)* addrspace(1)*
%.unpack12 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %2, align 8
%.unpack8.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
%.unpack8.unpack = load i64, i64 addrspace(1)* %.unpack8.elt, align 8
%3 = bitcast { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1 to float addrspace(1)* addrspace(1)*
%.unpack18 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %3, align 8
%.unpack14.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
%.unpack14.unpack = load i64, i64 addrspace(1)* %.unpack14.elt, align 8
; @ REPL[11]:2 within `copy1d_manual`
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/intrinsics/arguments.jl:48 within `thread_position_in_grid_1d`
; │┌ @ int.jl:87 within `+`
%4 = add i32 %thread_position_in_grid, 1
; └└
; ┌ @ int.jl:989 within `-`
; │┌ @ int.jl:518 within `rem`
; ││┌ @ number.jl:7 within `convert`
; │││┌ @ boot.jl:764 within `Int64`
; ││││┌ @ boot.jl:688 within `toInt64`
%5 = zext i32 %4 to i64
; │└└└└
; │ @ int.jl:991 within `-` @ int.jl:86
%6 = add nsw i64 %5, -1
; └
; @ REPL[11]:3 within `copy1d_manual`
; ┌ @ int.jl:340 within `&`
%7 = and i64 %6, 16383
; └
; @ REPL[11]:4 within `copy1d_manual`
; ┌ @ int.jl:501 within `>>` @ int.jl:494
%8 = ashr i64 %6, 14
; └
; @ REPL[11]:5 within `copy1d_manual`
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:115 within `getindex`
; │┌ @ abstractarray.jl:1280 within `_to_linear_index`
; ││┌ @ abstractarray.jl:2634 within `_sub2ind`
; │││┌ @ abstractarray.jl:95 within `axes`
; ││││┌ @ tuple.jl:222 within `map`
; │││││┌ @ range.jl:455 within `oneto`
; ││││││┌ @ range.jl:453 within `OneTo` @ range.jl:440
; │││││││┌ @ promotion.jl:488 within `max`
; ││││││││┌ @ essentials.jl:489 within `ifelse`
%9 = icmp sgt i64 %.unpack14.unpack, 0
%10 = select i1 %9, i64 %.unpack14.unpack, i64 0
; │││└└└└└└
; │││ @ abstractarray.jl:2634 within `_sub2ind` @ abstractarray.jl:2650
; │││┌ @ abstractarray.jl:2666 within `_sub2ind_recurse` @ abstractarray.jl:2666
; ││││┌ @ int.jl:88 within `*`
%11 = mul i64 %10, %8
; │└└└└
; │ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:115 within `getindex` @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:103
; │┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:82 within `arrayref`
; ││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:81 within `unsafe_load`
; │││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:9 within `pointerref`
; ││││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:9 within `macro expansion` @ /Users/kichi/.julia/dev/LLVM/src/interop/base.jl:40
; │││││┌ @ int.jl:86 within `-`
%12 = add i64 %11, %7
; │││││└
%13 = getelementptr inbounds float, float addrspace(1)* %.unpack18, i64 %12
%14 = load float, float addrspace(1)* %13, align 4
; └└└└└
; ┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:118 within `setindex!`
; │┌ @ abstractarray.jl:1280 within `_to_linear_index`
; ││┌ @ abstractarray.jl:2634 within `_sub2ind`
; │││┌ @ abstractarray.jl:95 within `axes`
; ││││┌ @ tuple.jl:222 within `map`
; │││││┌ @ range.jl:455 within `oneto`
; ││││││┌ @ range.jl:453 within `OneTo` @ range.jl:440
; │││││││┌ @ promotion.jl:488 within `max`
; ││││││││┌ @ essentials.jl:489 within `ifelse`
%15 = icmp sgt i64 %.unpack8.unpack, 0
%16 = select i1 %15, i64 %.unpack8.unpack, i64 0
; │││└└└└└└
; │││ @ abstractarray.jl:2634 within `_sub2ind` @ abstractarray.jl:2650
; │││┌ @ abstractarray.jl:2666 within `_sub2ind_recurse` @ abstractarray.jl:2666
; ││││┌ @ int.jl:88 within `*`
%17 = mul i64 %16, %8
; │└└└└
; │ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:118 within `setindex!` @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:105
; │┌ @ /Users/kichi/.julia/dev/Metal/src/device/array.jl:88 within `arrayset`
; ││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:84 within `unsafe_store!`
; │││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:44 within `pointerset`
; ││││┌ @ /Users/kichi/.julia/dev/LLVM/src/interop/pointer.jl:44 within `macro expansion` @ /Users/kichi/.julia/dev/LLVM/src/interop/base.jl:40
; │││││┌ @ int.jl:86 within `-`
%18 = add i64 %17, %7
; │││││└
%19 = getelementptr inbounds float, float addrspace(1)* %.unpack12, i64 %18
store float %14, float addrspace(1)* %19, align 4
; └└└└└
; @ REPL[11]:6 within `copy1d_manual`
ret void
}
julia> function copy2d(a, b)  # copy kernel indexed directly by a 2D thread position
(i, j) = thread_position_in_grid_2d()  # this thread's two grid coordinates, used as the two array subscripts
@inbounds a[i, j] = b[i, j]  # bounds checks are disabled; no guard against positions outside the arrays
return  # kernel returns nothing
end
copy2d (generic function with 1 method)
julia> function run2d()  # launch copy2d on the (non-const) globals a and b with a 2D launch configuration
@Metal.sync begin  # NOTE(review): presumably waits for the GPU work to finish so timings cover the whole copy; confirm against Metal.jl docs
@metal threads=(32,32) grid=(cld(n,32), cld(n,32)) copy2d(a, b)  # 32x32 threads per group; cld(n, 32) is n / 32 rounded up, in each dimension
end
end
run2d (generic function with 1 method)
julia> function broadcast()  # baseline: copy b into a with the built-in in-place broadcast instead of a hand-written kernel; defined in this session as a new function named broadcast
@Metal.sync begin  # NOTE(review): presumably waits for the GPU work to finish so timings cover the whole copy; confirm against Metal.jl docs
global a .= b  # element-wise in-place assignment into the global a
end
end
broadcast (generic function with 1 method)
julia> @btime run2d() # optimal
11.918 ms (133 allocations: 3.89 KiB)
Metal.HostKernel{typeof(copy2d), Tuple{MtlDeviceMatrix{Float32, 1}, MtlDeviceMatrix{Float32, 1}}}(copy2d, MtlFunction(_Z17julia_copy2d_421114MtlDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EE), MtlComputePipelineState(Ptr{Metal.cmt.MtComputePipelineState} @0x00000001087e3900, MtlDevice(Apple M1 Pro)))
julia> @btime broadcast(); # current a .= b
101.609 ms (196 allocations: 5.95 KiB)
julia> @btime run1d()
91.411 ms (140 allocations: 3.95 KiB)
Metal.HostKernel{typeof(copy1d), Tuple{MtlDeviceMatrix{Float32, 1}, MtlDeviceMatrix{Float32, 1}, Val{(Base.OneTo(16384), Base.OneTo(16384))}}}(copy1d, MtlFunction(_Z17julia_copy1d_292514MtlDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EE3ValI38_Base_OneTo_16384___Base_OneTo_16384__E), MtlComputePipelineState(Ptr{Metal.cmt.MtComputePipelineState} @0x0000000157f9aad0, MtlDevice(Apple M1 Pro)))
julia> @btime run1d_manual()
11.760 ms (131 allocations: 3.77 KiB)
Metal.HostKernel{typeof(copy1d_manual), Tuple{MtlDeviceMatrix{Float32, 1}, MtlDeviceMatrix{Float32, 1}}}(copy1d_manual, MtlFunction(_Z24julia_copy1d_manual_401214MtlDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EE), MtlComputePipelineState(Ptr{Metal.cmt.MtComputePipelineState} @0x000000016f9a5ed0, MtlDevice(Apple M1 Pro)))
julia>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment