Follow the instructions in https://github.com/nod-ai/playbook/blob/main/HOWTO/tres_leches_demo.md to compile and run VAE 2b on CPU.

Then get the necessary input .npy files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ | |
#hal.descriptor_set.layout<0, bindings = [ | |
#hal.descriptor_set.binding<0, storage_buffer>, | |
#hal.descriptor_set.binding<1, storage_buffer> | |
]> | |
]> | |
#config_mm = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 4], thread = [1, 8, 4, 0]}> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @jit_eval_174(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_174(%input0: tensor<1024x7x7x2xi8>) -> (%output0: tensor<1024x7x7x2xi8>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 4.000000e+00 : f32 | |
%cst_2 = arith.constant 2.000000e+00 : f32 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<1024x7x7x2xi8> | |
%1 = tensor.empty() : tensor<2x1024x7x7xi8> | |
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<1024x7x7x2xi8>) outs(%1 : tensor<2x1024x7x7xi8>) { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module { | |
func.func @main$async_dispatch_1_winograd_input_transform_11x11x16x8x8xf32() attributes {translation_info = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c262144 = arith.constant 262144 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c262144) : !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> -> tensor<16x64x64xf32> | |
%3 = tensor.empty() : tensor<11x11x16x8x8xbf16> | |
%4 = tensor.empty() : tensor<11x11x16x8x8xf32> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%expanded_127 = tensor.expand_shape %collapsed_126 [[0], [1], [2, 3]] output_shape [1, 512, 128, 128] : tensor<1x512x16384xf32> into tensor<1x512x128x128xf32> | |
%23 = tensor.empty() : tensor<512x128x128xf32> | |
%expanded_128 = tensor.expand_shape %cst_89 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32> | |
%expanded_129 = tensor.expand_shape %cst_90 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32> | |
%24 = tensor.empty() : tensor<1x512x128x128xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_127, %expanded_128, %expanded_129 : tensor<1x512x128x128xf32>, tensor<1x512xf32>, tensor<1x512xf32>) outs(%24 : tensor<1x512x128x128xf32>) { | |
^bb0(%in: f32, %in_741: f32, %in_742: f32, %out: f32): | |
%436 = arith.mulf |
The relevant Winograd transform matrices are:
- output transform: A_6x6_3x3 and its transpose AT_6x6_3x3
- input transform: B_6x6_3x3 and its transpose BT_6x6_3x3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module { | |
// Example of a pack layout that is unfavorable for the CPU backend:
// the untiled dim (64) is permuted outermost while the large dim 0
// (29241) is tiled by 16 with a padded tail tile.
func.func @pack_bad(%arg0: tensor<29241x128x64xbf16>) -> tensor<64x1828x64x16x2xbf16> {
  // Zero padding value: dim 0 (29241) is not a multiple of the inner tile
  // size 16 (ceil(29241 / 16) = 1828), so the last tile is zero-padded.
  %cst = arith.constant 0.000000e+00 : bf16
  %4 = tensor.empty() : tensor<64x1828x64x16x2xbf16>
  // Tile dims 0 and 1 by inner_tiles = [16, 2] and move the untiled dim 2
  // outermost (outer_dims_perm = [2, 0, 1]):
  //   29241x128x64 -> 64x1828x64x16x2  (1828 = ceil(29241/16), 64 = 128/2).
  %pack = tensor.pack %arg0 padding_value(%cst : bf16) outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %4 : tensor<29241x128x64xbf16> -> tensor<64x1828x64x16x2xbf16>
  return %pack : tensor<64x1828x64x16x2xbf16>
}
func.func @pack_good(%arg0: tensor<64x29241x128xbf16>) -> tensor<64x1828x64x16x2xbf16> { | |
%cst = arith.constant 0.000000e+00 : bf16 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} { | |
hal.executable private @main$async_dispatch_330 { | |
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module attributes {hal.device.targets = [#hal.device.target<"rocm", {legacy_sync}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>]>]} { | |
hal.executable private @conv_2d_nchw_fchw_dispatch_1 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) { | |
hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x242x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hal.executable public @conv_2d_nchw_fchw_dispatch_1 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) { | |
hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f |
Newer | Older