Skip to content

Instantly share code, notes, and snippets.

@raphlinus
Last active November 6, 2022 17:04
Show Gist options
  • Save raphlinus/e355655df39c94287cc1db36f57797d2 to your computer and use it in GitHub Desktop.
Save raphlinus/e355655df39c94287cc1db36f57797d2 to your computer and use it in GitHub Desktop.
Minimal metal repro of piet-gpu#199
import Foundation
import Metal
// This is translated by naga from tile_alloc.wgsl
// naga --buffer-bounds-check-policy ReadZeroSkipWrite tile_alloc.wgsl tile_alloc.metal
let metalProgram = """
// language: metal2.0
#include <metal_stdlib>
#include <simd/simd.h>
using metal::uint;
struct DefaultConstructible {
template<typename T>
operator T() && {
return T {};
}
};
struct _mslBufferSizes {
uint size1;
};
constexpr constant unsigned WG_SIZE = 256u;
typedef uint type_2[1];
struct main_Input {
};
kernel void main_(
metal::uint3 global_id [[thread_position_in_grid]]
, metal::uint3 local_id [[thread_position_in_threadgroup]]
, device metal::atomic_uint& bump [[user(fake0)]]
, device type_2& paths [[user(fake0)]]
, threadgroup uint& sh_tile_offset
, constant _mslBufferSizes& _buffer_sizes [[user(fake0)]]
) {
uint drawobj_ix = global_id.x;
uint tile_count_in = local_id.x + 1u;
if (local_id.x == (WG_SIZE - 1u)) {
uint _e15 = metal::atomic_fetch_add_explicit(&bump, tile_count_in, metal::memory_order_relaxed);
sh_tile_offset = 1u + _e15;
}
metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
uint tile_offset = sh_tile_offset;
if (drawobj_ix < 3u) {
if (uint(drawobj_ix) < 1 + (_buffer_sizes.size1 - 0 - 4) / 4) {
paths[drawobj_ix] = tile_offset;
}
return;
} else {
return;
}
}
"""
func barrierTest() {
do {
let device = MTLCreateSystemDefaultDevice()!
let queue = device.makeCommandQueue()!
let cmdbuf = queue.makeCommandBuffer()!
let library = try device.makeLibrary(source: metalProgram, options:
nil)
let function = library.makeFunction(name: "main_")!
let pipeline = try device.makeComputePipelineState(function: function)
let sharedBuf = MTLResourceOptions.storageModeShared
let buf0 = device.makeBuffer(length: 1024, options: sharedBuf)!
let buf1 = device.makeBuffer(length: 1024, options: sharedBuf)!
let encoder = cmdbuf.makeComputeCommandEncoder()!
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(buf0, offset: 0, index: 0)
encoder.setBuffer(buf1, offset: 0, index: 1)
let sizes: [UInt32] = [1024];
encoder.setBytes(sizes, length: 4, index: 2)
encoder.setThreadgroupMemoryLength(16, index: 0)
encoder.dispatchThreadgroups(MTLSizeMake(1, 1, 1), threadsPerThreadgroup: MTLSizeMake(256, 1, 1))
encoder.endEncoding()
cmdbuf.commit()
cmdbuf.waitUntilCompleted()
let contents = buf1.contents()
for i in 0..<4 {
print(i, contents.load(fromByteOffset: i * 4, as: UInt32.self))
}
} catch {
print("got an error")
}
}
barrierTest()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment