This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///
/// This is a very simple command line application to work out how
/// to just run a compute program on the Metal GPU. It's not safe,
/// and will crash without warning if anything goes wrong, but I
/// was just trying to get the bare minimum thing working.
///
/// It's based on Apple's sample code here:
/// https://developer.apple.com/documentation/metal/basic_tasks_and_concepts/performing_calculations_on_a_gpu
/// but I converted it from Objective-C to Swift and stripped out all the OO stuff.
///
/// Make sure you put this in your .metal file:
///
/**
#include <metal_stdlib>
using namespace metal;
kernel
void
add_arrays (
device const float * buffer_A,
device const float * buffer_B,
device float * result,
uint index [[ thread_position_in_grid ]])
{
result[index] = buffer_A[index] + buffer_B[index];
}
**/
import Metal | |
/// Adds two equally sized float arrays element by element on the GPU.
///
/// Runs the `add_arrays` kernel from the default Metal library with one
/// thread per element, and blocks until the command buffer has completed.
///
/// Any setup failure (no Metal device, no default library, missing kernel,
/// failed allocation) and any error reported by the command buffer stops the
/// program with a message naming what went wrong.
///
/// - Parameters:
///   - a: Left-hand operands.
///   - b: Right-hand operands; must have the same number of elements as `a`.
/// - Returns: An array whose element `i` is `a[i] + b[i]`; empty when the inputs are empty.
func calculate (a:[Float], b:[Float]) -> [Float]
{
    // Basic sanity check
    guard a.count == b.count else {
        fatalError("calculate: input arrays differ in length (\(a.count) vs \(b.count))")
    }
    let count = a.count

    // Nothing to add. Returning here also avoids asking Metal for
    // zero-length buffers and a zero-width threadgroup.
    guard count > 0 else { return [] }

    // Unwraps a Metal factory result, or stops with a message saying which
    // object could not be created (and the thrown error, if there was one).
    func unwrap_or_crash<T>(_ make: @autoclosure () throws -> T?, _ what: String) -> T
    {
        do {
            guard let made = try make() else {
                fatalError("calculate: could not create \(what)")
            }
            return made
        } catch {
            fatalError("calculate: could not create \(what): \(error)")
        }
    }

    // Create the Metal device & compute shaders, etc.
    let device = unwrap_or_crash(MTLCreateSystemDefaultDevice(), "the default Metal device")
    let library = unwrap_or_crash(device.makeDefaultLibrary(), "the default Metal library")
    let command_queue = unwrap_or_crash(device.makeCommandQueue(), "a command queue")
    let command_buffer = unwrap_or_crash(command_queue.makeCommandBuffer(), "a command buffer")
    let compute_encoder = unwrap_or_crash(command_buffer.makeComputeCommandEncoder(), "a compute command encoder")
    let add_function = unwrap_or_crash(library.makeFunction(name: "add_arrays"), "the add_arrays kernel function")
    let state = unwrap_or_crash(try device.makeComputePipelineState(function: add_function), "the compute pipeline state")

    // Allocate memory for the compute shader to use. All three buffers use
    // .storageModeShared and the same byte length, so the CPU can read the
    // computed results straight out of buffer_result afterwards.
    let byte_length = MemoryLayout<Float>.stride * count
    let buffer_A = unwrap_or_crash(device.makeBuffer(bytes: a, length: byte_length, options: .storageModeShared), "the input buffer for a")
    let buffer_B = unwrap_or_crash(device.makeBuffer(bytes: b, length: byte_length, options: .storageModeShared), "the input buffer for b")
    let buffer_result = unwrap_or_crash(device.makeBuffer(length: byte_length, options: .storageModeShared), "the result buffer")

    // The indices match the kernel's parameter order: buffer_A, buffer_B, result.
    compute_encoder.setBuffer(buffer_A, offset: 0, index: 0)
    compute_encoder.setBuffer(buffer_B, offset: 0, index: 1)
    compute_encoder.setBuffer(buffer_result, offset: 0, index: 2)

    // One thread per element along x; a threadgroup is as wide as the
    // pipeline allows, but never wider than the grid itself.
    let grid_size = MTLSizeMake(count, 1, 1)
    let thread_count = min(state.maxTotalThreadsPerThreadgroup, count)
    let thread_group_size = MTLSizeMake(thread_count, 1, 1)

    // Actually make the call
    compute_encoder.setComputePipelineState(state)
    compute_encoder.dispatchThreads(grid_size, threadsPerThreadgroup: thread_group_size)
    compute_encoder.endEncoding()
    command_buffer.commit()
    command_buffer.waitUntilCompleted()

    // Don't hand back whatever happens to be in the buffer if the GPU run failed.
    if let gpu_error = command_buffer.error {
        fatalError("calculate: the command buffer reported an error: \(gpu_error)")
    }

    // Copy the results out of the shared buffer into a Swift array.
    let result_pointer = buffer_result.contents().bindMemory(to: Float.self, capacity: count)
    return Array(UnsafeBufferPointer(start: result_pointer, count: count))
}
///
/// Dumbly putting it through its paces:
///
/// Builds an array of `size` random floats, each drawn from `1.0..<100.0`.
///
/// - Parameter size: Number of elements to generate; must not be negative.
/// - Returns: A newly generated array holding exactly `size` values.
func create_big_buffer (_ size:Int) -> [Float]
{
    precondition(size >= 0, "create_big_buffer: size must not be negative (got \(size))")
    // Mapping over the range gives the standard library the final element
    // count up front, instead of growing the array one append at a time.
    return (0..<size).map { _ in Float.random(in: 1.0..<100.0) }
}
/// Runs `calculate` once on two random arrays of `count` elements and prints
/// how long the call took, in seconds.
///
/// - Parameter count: Number of elements in each input array.
func test(_ count:Int)
{
    // The random inputs are generated before the clock starts, so only the
    // call to calculate is covered by the measurement.
    let lhs = create_big_buffer(count)
    let rhs = create_big_buffer(count)

    // Super-rough wall-clock timing (which probably doesn't really represent
    // what's happening on the GPU).
    let started_at = Date()
    _ = calculate(a: lhs, b: rhs)
    let elapsed = -started_at.timeIntervalSinceNow
    print ("test:\(count) took \(elapsed)")
}
/// Adds two small fixed arrays with `calculate` and prints both inputs
/// followed by the result, one array per line.
func test_with_output()
{
    let lhs: [Float] = [1, 2, 3, 4]
    let rhs = [Float](repeating: 10, count: 4)
    let sums = calculate(a: lhs, b: rhs)
    for row in [lhs, rhs, sums]
    {
        print (row)
    }
}
// Print one small, human-checkable result first, then time progressively larger inputs.
test_with_output()
for element_count in [100, 1000, 10000, 1000000, 10000000]
{
    test(element_count)
}
/// Sample output:
///
/// Metal Compute Test[20660:789358] Metal GPU Frame Capture Enabled
/// Metal Compute Test[20660:789358] Metal API Validation Enabled
/// [1.0, 2.0, 3.0, 4.0]
/// [10.0, 10.0, 10.0, 10.0]
/// [11.0, 12.0, 13.0, 14.0]
/// test:100 took 0.0023430585861206055
/// test:1000 took 0.0018529891967773438
/// test:10000 took 0.001916050910949707
/// test:1000000 took 0.012307047843933105
/// test:10000000 took 0.10860800743103027
/// Program ended with exit code: 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment