Skip to content

Instantly share code, notes, and snippets.

@gbaraldi
gbaraldi / f.ll
Created January 16, 2024 20:52
Register Spills
; ModuleID = 'f'
source_filename = "f"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-apple-darwin23.2.0"
@"+Core.Tuple#1439" = private unnamed_addr constant ptr @"+Core.Tuple#1439.jit", !julia.constgv !0
@"+Core.Tuple#1439.jit" = private alias ptr, inttoptr (i64 4529264784 to ptr)
; Function Signature: f(NTuple{50, Int64}, Int64)
This file has been truncated, but you can view the full file.
I[10:48:55.490] clangd version 17.0.3 (https://github.com/llvm/llvm-project 888437e1b60011b8a375dd30928ec925b448da57)
I[10:48:55.490] Features: mac+grpc+xpc
I[10:48:55.490] PID: 2605
I[10:48:55.490] Working directory: /Users/gabrielbaraldi/julia
I[10:48:55.490] argv[0]: /Users/gabrielbaraldi/Library/Application Support/Code - Insiders/User/globalStorage/llvm-vs-code-extensions.vscode-clangd/install/17.0.3/clangd_17.0.3/bin/clangd
I[10:48:55.490] argv[1]: --enable-config
I[10:48:55.491] Starting LSP over stdin/stdout
I[10:48:55.491] <-- initialize(0)
I[10:48:55.499] --> reply:initialize(0) 8 ms
I[10:48:55.500] <-- initialized
@gbaraldi
gbaraldi / llvm
Last active January 3, 2024 20:47
memcmp generated by llvm vs ours
Iterations: 100
Instructions: 1400
Total Cycles: 1403
Total uOps: 1800
Dispatch Width: 6
uOps Per Cycle: 1.28
IPC: 1.00
Block RThroughput: 7.0
@gbaraldi
gbaraldi / gist:8504833053d6c77865379dedc6824130
Last active January 1, 2024 08:00
M3 Max 36 GB output!
[2, 3, 4, 8, 9, 10, 16, 27, 32, 64, 81, 100, 128, 243, 256, 512, 729, 1000, 1024, 2048, 2187, 4096, 6561, 8192, 10000, 16384, 19683, 32768]
[6.406406406406406e-06, 0.0009810690018531303, 0.02457757296466974, 0.00036613377040552175, 0.05841814247936533, 0.15335071308081583, 0.31059715639810426, 2.2765440666204024, 3.7718561151079135, 1.1752042588960494, 5.866863170444948, 2.9560261549194187, 1.8095471067249096, 5.087776863956844, 11.97214693789007, 12.792844578413735, 47.16240600277262, 45.73406059941392, 65.07148199906084, 80.04057586790874, 205.91364619739895, 174.1874089125134, 354.94201641466566, 599.6347164155466, 748.8818960202927, 619.0967398250291, 774.0532000340861, 632.1859241014777]
2023,Macbook Pro 14'inch, Apple M3 Max, 36 GBs
@gbaraldi
gbaraldi / new.s
Last active December 27, 2023 20:46
Benchmarks
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 14, 0
.globl "_julia_perf_manual_example!_46294" ; -- Begin function julia_perf_manual_example!_46294
.p2align 2
"_julia_perf_manual_example!_46294": ; @"julia_perf_manual_example!_46294"
; Function Signature: perf_manual_example!(Base.SubArray{Float32, 3, Array{Float32, 3}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}, false}, Base.SubArray{Float32, 3, Array{Float32, 3}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}, false}, Base.SubArray{Float32, 3, Array{Float32, 3}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}, false}, Base.IteratorsMD.CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}})
.cfi_startproc
; %bb.0: ; %top
;DEBUG_VALUE: perf_manual_example!:X <- [DW_OP_deref] [$x0+0]
;DEBUG_VALUE: perf_manual_example!:X <- [DW_OP_deref] [$x0+0]
; Function Signature: perf_sumeach_view(Base.BitArray{2})
define { ptr, i8 } @julia_perf_sumeach_view_49258(ptr noalias nocapture noundef nonnull align 8 dereferenceable(8) %union_bytes_return, ptr noundef nonnull align 8 dereferenceable(32) %"A::BitArray") #0 {
top:
%jlcallframe1 = alloca [4 x ptr], align 8
%gcframe2 = alloca [4 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe2, i8 0, i64 32, i1 true)
%thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #12
%tls_ppgcstack = getelementptr i8, ptr %thread_ptr, i64 -8
%tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
store i64 8, ptr %gcframe2, align 16
# This file is machine-generated - editing it directly is not advised
julia_version = "1.10.0-rc1"
manifest_format = "2.0"
project_hash = "acb9f21a4f584e84960b90ee7ee41028ca84e530"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "02f731463748db57cc2ebfbd9fbc9ce8280d3433"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@gbaraldi
gbaraldi / juliac.jl
Created November 14, 2023 20:50
Running `julia juliac.jl script.jl test` on the command line produces a runnable executable (apart from rpath handling, which I haven't implemented yet)
import Base: MethodInstance
using Base.Linking
Base.@ccallable function c_main(argc::Cint, argv::Ptr{Ptr{Cchar}})::Cint
argv = ccall(:uv_setup_args, Ptr{Ptr{Cchar}}, (Cint, Ptr{Ptr{Cchar}}), argc, argv)
argcref = Ref{Cint}(argc)
argvref = Ref{Ptr{Ptr{Cchar}}}(argv)
ccall(:jl_parse_opts, Cvoid, (Ptr{Cint}, Ptr{Ptr{Ptr{Cchar}}}), argcref, argvref)
ccall(:julia_init, Cvoid, (Cint,), 2)
; ModuleID = '#682#f'
source_filename = "#682#f"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-ni:10:11:12:13"
target triple = "aarch64-apple-darwin23.0.0"
; Function Attrs: sspstrong
define swiftcc void @"julia_#682#f_8484"(ptr noalias nocapture noundef nonnull sret([8 x i64]) align 8 dereferenceable(64) %sret_return, ptr nonnull swiftself %pgcstack_arg, ptr addrspace(11) nocapture noundef nonnull readonly align 8 dereferenceable(64) %"#self#::#682#f#6", ptr addrspace(11) nocapture noundef nonnull readonly align 8 dereferenceable(64) %"b::Tuple") #0 !dbg !8 {
top:
%native_convert_buffer = alloca [8 x i64], align 8
%ccall_sret = alloca [8 x i64], align 8
This file has been truncated, but you can view the full file.
┌ Warning: Using fallback BLAS replacements, performance may be degraded
└ @ Enzyme.Compiler ~/.julia/packages/GPUCompiler/U36Ed/src/utils.jl:59
after simplification :
; Function Attrs: mustprogress willreturn
define void @preprocess_julia_integrate_1045({} addrspace(10)* noundef nonnull align 8 dereferenceable(168) %0) local_unnamed_addr #40 !dbg !1849 {
top:
%newstruct23 = alloca { [1 x [1 x i64]], i64 }, align 8
%newstruct57 = alloca [1 x [1 x i64]], align 8
%newstruct256 = alloca { [1 x [1 x i64]], i64 }, align 8
%newstruct262 = alloca [1 x [1 x i64]], align 8