Skip to content

Instantly share code, notes, and snippets.

@abadams
abadams / averaging_tree.py
Last active September 17, 2021 19:02
z3py query to solve for an averaging tree
import z3
#kernel = [1, 2, 1]
#ops = 2
kernel = [1, 4, 6, 4, 1]
ops = 10
kernel_sum = sum(kernel)
add x9, x8, x17
ldr d26, [x8]
add x13, x29, x15
ldr d28, [x9]
add x9, x9, x17
ld1r { v29.4s }, [x13]
sub x13, x13,
ldr d27, [x9]
add x9, x9, x17
ld1r { v30.4s }, [x13]
@abadams
abadams / gist:3a94fe455c5053b01e80d25342568fdf
Created May 13, 2019 18:17
halide convolution lowered IR
assert((reinterpret(uint64, tent.buffer) != (uint64)0), halide_error_buffer_argument_is_null("tent"))
assert((reinterpret(uint64, in.buffer) != (uint64)0), halide_error_buffer_argument_is_null("in"))
assert((reinterpret(uint64, blur2.buffer) != (uint64)0), halide_error_buffer_argument_is_null("blur2"))
let blur2 = _halide_buffer_get_host(blur2.buffer)
let blur2.min.0 = _halide_buffer_get_min(blur2.buffer, 0)
let blur2.extent.0 = _halide_buffer_get_extent(blur2.buffer, 0)
let blur2.stride.0 = _halide_buffer_get_stride(blur2.buffer, 0)
let blur2.min.1 = _halide_buffer_get_min(blur2.buffer, 1)
let blur2.extent.1 = _halide_buffer_get_extent(blur2.buffer, 1)
let blur2.stride.1 = _halide_buffer_get_stride(blur2.buffer, 1)
Attempting to disprove: (((1 == 1) && ((7 == 7) && ((4 == 4) && (g.s0.y.y > g.s0.y.y$8)))) && (((((uint1)1 && (f.s0.x == g.s0.x$1)) && (f.s0.y == (((g.s0.y.y$8*5) + g.min.1) + g.s0.y.v10$1))) && (let f.s0.x.loop_extent = max((100 - g.min.0), g.extent.0) in (((((((uint1)1 && (g.s0.y.y >= 0)) && (g.s0.y.y < (0 + ((g.extent.1 + 4)/5)))) && (f.s0.y >= ((g.s0.y.y*5) + g.min.1))) && (f.s0.y < (((g.s0.y.y*5) + g.min.1) + 5))) && (f.s0.x >= g.min.0)) && (f.s0.x < (g.min.0 + f.s0.x.loop_extent))))) && (((((((((((((uint1)1 && (g.s0.y.y$8 >= 0)) && (g.s0.y.y$8 < (0 + ((g.extent.1 + 4)/5)))) && (f.s0.y >= ((g.s0.y.y$8*5) + g.min.1))) && (f.s0.y < (((g.s0.y.y$8*5) + g.min.1) + 5))) && (f.s1.y >= ((g.s0.y.y$8*5) + g.min.1))) && (f.s1.y < (((g.s0.y.y$8*5) + g.min.1) + 5))) && (f.s2.y >= ((g.s0.y.y$8*5) + g.min.1))) && (f.s2.y < (((g.s0.y.y$8*5) + g.min.1) + 5))) && (g.s0.y.v10$1 >= 0)) && (g.s0.y.v10$1 < (0 + 5))) && (g.s0.x$1 >= g.min.0)) && (g.s0.x$1 < (g.min.0 + g.extent.0)))))
Expr: (((((((((0 <= g.s0.y.y) && (g.s0.y.y
set -x
# Build the generator to autotune. This script will be autotuning the
# autoscheduler's cost model training pipeline, which is large enough
# to be interesting.
GENERATOR=./cost_model.generator
PIPELINE=halide_autoscheduler_train_cost_model
c++ ../../src/CostModelGenerator.cpp -std=c++11 -DCOMPILING_SEPARATELY -I ../../include/ ../../lib/libHalide.a -o cost_model.generator -ldl -lpthread -lz -fno-rtti -fno-exceptions ../../tools/GenGen.cpp
# Make sure Halide is built
@abadams
abadams / gist:8614cc69d80b4a5b014f34bfcfd30246
Created November 13, 2018 18:31
camera pipe loop nest
realize: processed
processed 2 40 1 t
realize: b_b
realize: f3
realize: f1
realize: r_b
realize: r_r
realize: g_r
realize: g_gr
realize: g_gb
@abadams
abadams / gist:cfd82f98306dd40eeb875f25282c172f
Created November 13, 2018 18:29
autoscheduled camera pipe
module name=processed, target=x86-64-linux-avx-avx2-f16c-fma-no_asserts-no_bounds_query-no_runtime-sse41
external_plus_metadata func processed (input, matrix_3200, matrix_7000, black_level, color_temp, contrast, gamma, sharpen_strength, white_level, processed) {
assert((reinterpret(uint64, processed.buffer) != (uint64)0), halide_error_buffer_argument_is_null("processed"))
assert((reinterpret(uint64, matrix_7000.buffer) != (uint64)0), halide_error_buffer_argument_is_null("matrix_7000"))
assert((reinterpret(uint64, matrix_3200.buffer) != (uint64)0), halide_error_buffer_argument_is_null("matrix_3200"))
assert((reinterpret(uint64, input.buffer) != (uint64)0), halide_error_buffer_argument_is_null("input"))
let input = _halide_buffer_get_host(input.buffer)
let input.min.0 = _halide_buffer_get_min(input.buffer, 0)
let input.stride.0 = _halide_buffer_get_stride(input.buffer, 0)
let input.min.1 = _halide_buffer_get_min(input.buffer, 1)
@abadams
abadams / gist:05afed8803587ed83a121868d97e7e60
Created September 12, 2018 20:58
profiler + hardware counters
halide_sgemm_notrans
total time: 3508.201904 ms samples: 3198 runs: 155 time/run: 22.633560 ms
average threads used: 19.534710
heap allocations: 155 peak heap usage: 17305600 bytes
As: 0.700ms (3%) threads: 14.474 peak: 17305600 num: 155 avg: 17305600
IPC: 1.96 L1: 231707996 (77%) LLC: 1912567 (74%)
Atmp: 0.505ms (2%) threads: 17.577 stack: 1024
IPC: 2.11 L1: 194752315 (80%) LLC: 1802731 (81%)
result: 7.521ms (33%) threads: 19.817
IPC: 3.14 L1: 2345267500 (79%) LLC: 10048920 (94%)
Type 1:
def fun(float(N) X, float(N) Y) -> (Z, W) {
tmp(i) = X(i) + 4
Z(i) = tmp(i)
W(i) = Y(i) + 4
}
Type 2:
@abadams
abadams / gist:422d7ea52c8a7320bc6cd632ec02e50d
Created May 8, 2018 14:44
Probably true things that the Halide simplifier can't prove
(max(((17 - (v0 % 4))/4), ((max(((min((v1*2), ((v0 % 2) + 5)) + (v0/2))/2), ((v0/4) + v1)) - (((v0 + -6)/4) + v1)) + 2)) <= 1024)
(max(((17 - (v0 % 4))/4), ((((min((v1*2), ((v0 % 2) + 20)) + (v0/2))/2) - (((v0 + -6)/4) + v1)) + 2)) <= 1024)
(max(((8 - (((v0*161) + v1) % 2))/2), ((((min((v2*2), 159) + ((v0*161) + v1))/2) - (((((v0*161) + v1) + -3)/2) + v2)) + 2)) <= 1024)
(max(max(min(v0, 12), (min(min(min(min(min(min(min(min(min(min(min(min(min(min(min(((v1 + -15)/16), (v2 + -1)), ((v1 + -14)/16)), ((v1 + -13)/16)), ((v1 + -2)/16)), ((v1 + -1)/16)), ((v1 + 1)/16)), ((v3 + -2)/16)), ((v3 + -1)/16)), ((v4 + -2)/16)), ((v4 + -1)/16)), ((v5 + -2)/16)), ((v5 + -1)/16)), ((v6 + -2)/16)), ((v6 + -1)/16)), 11) + 1)), 0) <= max(min(v0, 12), 0))
(max(max(min(((v0 + 131)/144), 2), (min(((min((min(min(min(min(min(min((min((((v1 + v2) + 25)/2), v3) - ((v4*288) + v5)), v6), (min((((v1 + v2) + 29)/2), v3) - ((v4*288) + v5))), (v7 + 13)), v8), v9), (v7 + 15)) + 128), v8) + -140)/144), 1) + 1)), 0) <= max(min(((v0 + 131)