Skip to content

Instantly share code, notes, and snippets.

@Khrob
Last active May 6, 2021
Embed
What would you like to do?
///
/// This is a very simple command line application to work out how
/// to just run a compute program on the Metal GPU. It's not safe,
/// and will crash without warning if anything goes wrong, but I
/// was just trying to get the bare minimum thing working.
///
/// It's based on Apple's sample code here:
/// https://developer.apple.com/documentation/metal/basic_tasks_and_concepts/performing_calculations_on_a_gpu
/// but I converted it from Objective-C to Swift and stripped out all the OO stuff.
///
/// Make sure you put this in your .metal file:
///
/**
#include <metal_stdlib>
using namespace metal;
kernel
void
add_arrays (
device const float * buffer_A,
device const float * buffer_B,
device float * result,
uint index [[ thread_position_in_grid ]])
{
result[index] = buffer_A[index] + buffer_B[index];
}
**/
import Metal
/// Runs the `add_arrays` compute kernel from the default Metal library over
/// two equally sized float arrays and returns the contents of the result buffer.
///
/// - Parameters:
///   - a: Values copied into the buffer bound at kernel argument index 0.
///   - b: Values copied into the buffer bound at kernel argument index 1;
///        must contain the same number of elements as `a`.
/// - Returns: `a.count` floats read back from the buffer bound at index 2
///   (the kernel's output). Returns an empty array when the inputs are empty.
///
/// This is deliberately minimal: it traps if the counts differ or if any
/// Metal object (device, library, queue, function, pipeline, buffer) cannot
/// be created.
func calculate (a:[Float], b:[Float]) -> [Float]
{
    // Basic sanity check
    if a.count != b.count { fatalError("calculate(a:b:) requires arrays of equal length") }
    let count = a.count

    // With no elements, every buffer length and the threadgroup width below
    // would be zero, so there is nothing to send to the GPU.
    if count == 0 { return [] }

    // Create the Metal device & compute shaders, etc.
    let device = MTLCreateSystemDefaultDevice()!
    let library = device.makeDefaultLibrary()!
    let command_queue = device.makeCommandQueue()!
    let command_buffer = command_queue.makeCommandBuffer()!
    let compute_encoder = command_buffer.makeComputeCommandEncoder()!
    let add_function = library.makeFunction(name: "add_arrays")!
    let state = try! device.makeComputePipelineState(function: add_function)

    // Allocate memory for the compute shader to use, note that
    // buffer_result is set to .storageModeShared, so the CPU
    // can have access to the computed results.
    // All three buffers hold `count` floats, so they share one byte length.
    let byte_length = MemoryLayout<Float>.stride * count
    let buffer_A = device.makeBuffer(bytes: a, length: byte_length, options: .storageModeShared)!
    let buffer_B = device.makeBuffer(bytes: b, length: byte_length, options: .storageModeShared)!
    let buffer_result = device.makeBuffer(length: byte_length, options: .storageModeShared)!
    compute_encoder.setBuffer(buffer_A, offset: 0, index: 0)
    compute_encoder.setBuffer(buffer_B, offset: 0, index: 1)
    compute_encoder.setBuffer(buffer_result, offset: 0, index: 2)

    // Work out how many worker threads we need to use: one grid position per
    // element, grouped into threadgroups no wider than the pipeline allows.
    let grid_size = MTLSizeMake(count, 1, 1)
    let thread_count = min(state.maxTotalThreadsPerThreadgroup, count)
    let thread_group_size = MTLSizeMake(thread_count, 1, 1)

    // Actually make the call
    compute_encoder.setComputePipelineState(state)
    compute_encoder.dispatchThreads(grid_size, threadsPerThreadgroup: thread_group_size)
    compute_encoder.endEncoding()
    command_buffer.commit()
    // Block until the GPU is done so the result buffer can be read below.
    command_buffer.waitUntilCompleted()

    // Annoying conversion stuff: view the raw result bytes as `count` floats
    // and copy them into a Swift array.
    let result_pointer = buffer_result.contents().bindMemory(to: Float.self, capacity: count)
    return Array(UnsafeBufferPointer(start: result_pointer, count: count))
}
///
/// Dumbly putting it through its paces:
///
/// Builds an array of `size` random floats, each drawn from the half-open
/// range 1.0..<100.0.
///
/// - Parameter size: Number of elements to generate.
/// - Returns: The freshly generated array.
func create_big_buffer (_ size:Int) -> [Float]
{
    // One independent random draw per index; the index itself is unused.
    return (0..<size).map { _ in Float.random(in: 1.0..<100.0) }
}
/// Times one `calculate` call over two random buffers of `count` floats each
/// and prints the elapsed time.
///
/// - Parameter count: Number of elements in each input buffer.
func test(_ count:Int)
{
    // Generate the inputs before the clock starts so buffer creation is not timed.
    let lhs = create_big_buffer(count)
    let rhs = create_big_buffer(count)

    // Super-rough timing (which probably doesn't really represent what's happening on the GPU)
    let began = Date()

    // Run the calculation; only the duration matters here, so the result is discarded.
    _ = calculate(a: lhs, b: rhs)

    let elapsed = Date().timeIntervalSince(began)
    print ("test:\(count) took \(elapsed)")
}
/// Runs `calculate` on a tiny fixed input and prints both inputs followed by
/// the result, so the output can be checked by eye.
func test_with_output()
{
    let lhs : [Float] = [1,2,3,4]
    let rhs = [Float](repeating: 10, count: 4)
    let sum = calculate(a: lhs, b: rhs)

    // Print order matches the call: first input, second input, result.
    for row in [lhs, rhs, sum] { print (row) }
}
// Show one small, human-checkable result first, then time progressively larger inputs.
test_with_output()
for element_count in [100, 1000, 10000, 1000000, 10000000] { test(element_count) }
/// Sample output:
///
/// Metal Compute Test[20660:789358] Metal GPU Frame Capture Enabled
/// Metal Compute Test[20660:789358] Metal API Validation Enabled
/// [1.0, 2.0, 3.0, 4.0]
/// [10.0, 10.0, 10.0, 10.0]
/// [11.0, 12.0, 13.0, 14.0]
/// test:100 took 0.0023430585861206055
/// test:1000 took 0.0018529891967773438
/// test:10000 took 0.001916050910949707
/// test:1000000 took 0.012307047843933105
/// test:10000000 took 0.10860800743103027
/// Program ended with exit code: 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment