Skip to content

Instantly share code, notes, and snippets.

View makslevental's full-sized avatar
💩

Maksim Levental makslevental

💩
View GitHub Profile
CCACHE_CPP2=yes CCACHE_HASHDIR=yes /usr/local/bin/ccache /usr/local/opt/ccache/libexec/c++ -D_DEBUG -D_GLIBCXX_ASSERTIONS -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/Users/runner/work/llvm-aie/llvm-aie/build/temp/utils/TableGen -I/Users/runner/work/llvm-aie/llvm-aie/llvm-aie/llvm/utils/TableGen -I/Users/runner/work/llvm-aie/llvm-aie/build/temp/include -I/Users/runner/work/llvm-aie/llvm-aie/llvm-aie/llvm/include -I/Users/runner/work/llvm-aie/llvm-aie/llvm-aie/llvm/utils/TableGen/GlobalISel/.. -fPIC -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -O3 -DNDEBUG -std=c++17 -arch x86_64 -isys
from __future__ import annotations
import contextlib
import math
import cupy as cp
import mlir.extras.types as T
import numpy as np
from cupy.cuda import Module
def sgemm_shared_mem_block[
M, K, N, dtype, BLOCK_SIZE
](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
# allocate buffer for current block in fast shared mem
# shared mem is shared between all threads in a block
base = gpu.dynamic_shared_memory()
A_shared = memref.view(base, (BLOCK_SIZE, BLOCK_SIZE), dtype=dtype)
B_shared = memref.view(
base, (BLOCK_SIZE, BLOCK_SIZE), dtype=dtype, shift=BLOCK_SIZE * BLOCK_SIZE
)
def get_localsplus_name_to_idx(code: types.CodeType) -> tuple[tuple[str, ...], dict[str, int]]:
    """Return the concatenated local/cell/free names of *code* and a name->index map.

    The names are ``code.co_varnames + code.co_cellvars + code.co_freevars``,
    in that order.

    Args:
        code: The code object whose name tuples are read.

    Returns:
        A ``(localsplus, name_to_idx)`` pair, where ``localsplus`` is the
        concatenated tuple of names and ``name_to_idx`` maps each name to its
        position in that tuple.

    Note:
        A name present in more than one of the three tuples appears more than
        once in ``localsplus``; the mapping then keeps the index of its last
        occurrence.
    """
    localsplus = code.co_varnames + code.co_cellvars + code.co_freevars
    return localsplus, {v: i for i, v in enumerate(localsplus)}
def print_local_derefs(code):
    """Print every entry of *code*'s concrete bytecode that mentions ``LOAD_DEREF``.

    The code object is converted with ``ConcreteBytecode.from_code`` and each
    element of the result is printed when its ``str()`` form contains the
    substring ``"LOAD_DEREF"``.

    Args:
        code: The object handed to ``ConcreteBytecode.from_code``
            (presumably a code object -- confirm against callers).
    """
    # Keep the converted form under its own name instead of rebinding the
    # ``code`` parameter.
    concrete = ConcreteBytecode.from_code(code)
    for entry in concrete:
        # Substring match on the textual form, so any entry whose string
        # representation contains "LOAD_DEREF" is printed.
        if "LOAD_DEREF" in str(entry):
            print(entry)
@makslevental
makslevental / mlir.mlir
Last active April 15, 2024 20:39
comparison mlir vs nvcc
module attributes {gpu.container_module} {
gpu.module @naive [#nvvm.target] {
gpu.func @mat_product_kernel(%arg0: memref<2048x2048xf32>, %arg1: memref<2048x2048xf32>, %arg2: memref<2048x2048xf32>) kernel {
%block_dim_x = gpu.block_dim x
%block_id_x = gpu.block_id x
%0 = arith.muli %block_dim_x, %block_id_x : index
%thread_id_x = gpu.thread_id x
%1 = arith.addi %0, %thread_id_x : index
%block_dim_y = gpu.block_dim y
%block_id_y = gpu.block_id y
@aie.device(AIEDevice.ipu)
def ipu():
tile_0_0 = aie.tile(0, 0)
tile_0_1 = aie.tile(0, 1)
tile_0_2 = aie.tile(0, 2)
tile_1_0 = aie.tile(1, 0)
tile_2_0 = aie.tile(2, 0)
tile_3_0 = aie.tile(3, 0)
def test_single_prod_mult_cons_only_two_locks(ctx: MLIRContext, workdir: Path):
K = 32
iters = 20
ipu_insts = aiex.ipu.get_prolog()
shim_channels = {}
@aie.device(AIEDevice.ipu)
# Due to pipelining, the DMA will issue the acquire request from the second BD before it issues the release request from the first one.
# read the full AB and then stride
in_AB_lock = aie.lock(
t.tile,
init=fat_AB_rows_per_memtile_per_round,
sym_name=f"in_AB_lock_{int(t.tile.col)}",
)
out_AB_lock = aie.lock(
t.tile, init=0, sym_name=f"out_AB_lock_{int(t.tile.col)}"
)
// File generated by darts version U-2022.12#3eec2545bc#230622, Tue Mar 19 11:31:11 2024
// Copyright 2014-2022 Synopsys, Inc. All rights reserved.
// darts -I/opt/tools/Xilinx/Vitis/2023.2/aietools/data/aie_ml/lib -d -h -I/opt/tools/Xilinx/Vitis/2023.2/aietools/data/aie_ml/lib/runtime_cxx/libcxx-lite/include -I/opt/tools/Xilinx/Vitis/2023.2/aietools/data/aie_ml/lib/runtime_cxx/libs/libcxx-9.0.0/include-lite -I/opt/tools/Xilinx/Vitis/2023.2/aietools/data/aie_ml/lib/runtime/include -I<BOOST_DIR> -D__AIENGINE__ -D__AIE_ARCH__=20 -D__AIEARCH__=20 -D__tct_tgt__=230622 -L +Ihex +nanno +u core_0_2.elf me
// Release: ipp U-2022.12-TGT-230622
.text_segment PM 0
.entry_point
.label __AIE_ARCH_MODEL_VERSION__20010300__inlined__1__me_basic___main_init_
aie.device(ipu) {
%tile_0_0 = tile(0, 0)
%tile_0_1 = tile(0, 1)
%tile_0_2 = tile(0, 2)
%buffer = buffer(%tile_0_1) : memref<64xi32>
%input_lock = lock(%tile_0_1) {init = 0}
%output_lock = lock(%tile_0_1) {init = 0}