Khrob/metal_compute.swift

## metal_compute.swift
///
/// This is a very simple command line application to work out how
/// to just run a compute program on the Metal GPU. It's not safe,
/// and will crash without warning if anything goes wrong, but I
/// was just trying to get the bare minimum thing working.
///
/// It's based off of Apple's sample code here:
/// https://developer.apple.com/documentation/metal/basic_tasks_and_concepts/performing_calculations_on_a_gpu
/// but I converted it from Objective-C to Swift and stripped out all the OO stuff.
///
/// Make sure you put this in your .metal file:
///
/**

    #include <metal_stdlib>
    using namespace metal;

    kernel
    void
    add_arrays (
               device const float * buffer_A,
               device const float * buffer_B,
               device float * result,
               uint index [[ thread_position_in_grid ]])
    {
        result[index] = buffer_A[index] + buffer_B[index];
    }

**/

import Metal

/// Adds two equally sized `Float` arrays element-wise on the GPU.
///
/// Encodes a single dispatch of the `add_arrays` kernel (looked up in the
/// default Metal library) with one thread per element, blocks until the
/// command buffer has completed, and copies the result buffer back into a
/// Swift array.
///
/// This is still deliberately crash-happy: every failure ends in a
/// `fatalError`, but each one now says what went wrong.
///
/// - Parameters:
///   - a: First operand array.
///   - b: Second operand array; must contain as many elements as `a`.
/// - Returns: The contents of the result buffer after the kernel has run,
///   or an empty array when the inputs are empty.
func calculate (a:[Float], b:[Float]) -> [Float]
{
    // Basic sanity check
    guard a.count == b.count else
    {
        fatalError("calculate(a:b:) needs arrays of equal length, got \(a.count) and \(b.count)")
    }
    let count = a.count

    // Nothing to compute. Bail out before touching Metal: zero-length
    // buffers and a zero-sized threadgroup are not valid inputs.
    guard count > 0 else { return [] }

    // Create the Metal device & compute shaders, etc.
    guard let device = MTLCreateSystemDefaultDevice() else
    {
        fatalError("No Metal device available")
    }
    guard let library = device.makeDefaultLibrary() else
    {
        fatalError("Could not load the default Metal library")
    }
    guard let add_function = library.makeFunction(name: "add_arrays") else
    {
        fatalError("Kernel 'add_arrays' not found in the default Metal library")
    }
    let state: MTLComputePipelineState
    do
    {
        state = try device.makeComputePipelineState(function: add_function)
    }
    catch
    {
        fatalError("Could not create the compute pipeline state: \(error)")
    }

    // Allocate memory for the compute shader to use. Every buffer is
    // created with .storageModeShared; the result buffer's contents are
    // read back on the CPU below. All three buffers hold `count` floats,
    // so they share one byte length.
    let byte_count = MemoryLayout<Float>.stride * count
    guard
        let buffer_A      = device.makeBuffer(bytes: a, length: byte_count, options: .storageModeShared),
        let buffer_B      = device.makeBuffer(bytes: b, length: byte_count, options: .storageModeShared),
        let buffer_result = device.makeBuffer(length: byte_count, options: .storageModeShared)
    else
    {
        fatalError("Could not allocate \(byte_count)-byte Metal buffers")
    }

    guard
        let command_queue   = device.makeCommandQueue(),
        let command_buffer  = command_queue.makeCommandBuffer(),
        let compute_encoder = command_buffer.makeComputeCommandEncoder()
    else
    {
        fatalError("Could not create the Metal command queue / buffer / encoder")
    }

    // Buffer indices match the kernel's parameter order.
    compute_encoder.setComputePipelineState(state)
    compute_encoder.setBuffer(buffer_A, offset: 0, index: 0)
    compute_encoder.setBuffer(buffer_B, offset: 0, index: 1)
    compute_encoder.setBuffer(buffer_result, offset: 0, index: 2)

    // One thread per element: the grid is `count` threads wide, split into
    // threadgroups no larger than the pipeline allows.
    // NOTE(review): dispatchThreads needs GPU support for non-uniform
    // threadgroup sizes -- confirm for the hardware this has to run on.
    let grid_size = MTLSizeMake(count, 1, 1)
    let thread_count = min(state.maxTotalThreadsPerThreadgroup, count)
    let thread_group_size = MTLSizeMake(thread_count, 1, 1)

    // Actually make the call
    compute_encoder.dispatchThreads(grid_size, threadsPerThreadgroup: thread_group_size)
    compute_encoder.endEncoding()
    command_buffer.commit()
    command_buffer.waitUntilCompleted()

    // Don't hand back whatever happens to be in the buffer if the GPU work failed.
    if let error = command_buffer.error
    {
        fatalError("Metal command buffer failed: \(error)")
    }

    // Copy the floats out of the shared buffer into a regular Swift array.
    let result_pointer = buffer_result.contents().bindMemory(to: Float.self, capacity: count)
    return Array(UnsafeBufferPointer(start: result_pointer, count: count))
}


///
/// Dumbly putting it through its paces:
///

/// Builds an array of `size` random floats, each drawn from `1.0..<100.0`.
///
/// - Parameter size: Number of elements to generate.
/// - Returns: The freshly generated array.
func create_big_buffer (_ size:Int) -> [Float]
{
    return (0..<size).map { _ in Float.random(in: 1.0..<100.0) }
}

/// Runs `calculate` once over two random buffers of `count` elements and
/// prints how long the call took, in seconds.
///
/// - Parameter count: Number of elements in each input buffer.
func test(_ count:Int)
{
    // Inputs are generated before the clock starts, so only the
    // calculate call itself is measured.
    let lhs = create_big_buffer(count)
    let rhs = create_big_buffer(count)

    // Wall-clock timing around the whole call; very rough, and probably
    // not a good picture of what is happening on the GPU.
    let started_at = Date()
    _ = calculate(a: lhs, b: rhs)
    let elapsed = -started_at.timeIntervalSinceNow

    print("test:\(count) took \(elapsed)")
}

/// Runs `calculate` on a tiny fixed input and prints both operands
/// followed by the result, one array per line.
func test_with_output()
{
    let lhs: [Float] = [1, 2, 3, 4]
    let rhs: [Float] = [10, 10, 10, 10]
    let sum = calculate(a: lhs, b: rhs)

    for row in [lhs, rhs, sum]
    {
        print(row)
    }
}

// Small run with visible output first, then timed runs at growing sizes.
test_with_output()

for element_count in [100, 1_000, 10_000, 1_000_000, 10_000_000]
{
    test(element_count)
}

/// Sample output:
///
/// Metal Compute Test[20660:789358] Metal GPU Frame Capture Enabled
/// Metal Compute Test[20660:789358] Metal API Validation Enabled
/// [1.0, 2.0, 3.0, 4.0]
/// [10.0, 10.0, 10.0, 10.0]
/// [11.0, 12.0, 13.0, 14.0]
/// test:100 took 0.0023430585861206055
/// test:1000 took 0.0018529891967773438
/// test:10000 took 0.001916050910949707
/// test:1000000 took 0.012307047843933105
/// test:10000000 took 0.10860800743103027
/// Program ended with exit code: 0
	///
	/// This is a very simple command line application to work out how
	/// to just run a compute program on the Metal GPU. It's not safe,
	/// and will crash without warning if anything goes wrong, but I
	/// was just trying to get the bare minimum thing working.
	///
	/// It's based off of Apple's sample code here:
	/// https://developer.apple.com/documentation/metal/basic_tasks_and_concepts/performing_calculations_on_a_gpu
	/// but I converted it from Objective-C to Swift and stripped out all the OO stuff.
	///
	/// Make sure you put this in your .metal file:
	///
	/**

	#include <metal_stdlib>
	using namespace metal;

	kernel
	void
	add_arrays (
	device const float * buffer_A,
	device const float * buffer_B,
	device float * result,
	uint index [[ thread_position_in_grid ]])
	{
	result[index] = buffer_A[index] + buffer_B[index];
	}

	**/

	import Metal

	/// Adds two equally sized `Float` arrays element-wise on the GPU.
	///
	/// Encodes a single dispatch of the `add_arrays` kernel (looked up in the
	/// default Metal library) with one thread per element, blocks until the
	/// command buffer has completed, and copies the result buffer back into a
	/// Swift array.
	///
	/// This is still deliberately crash-happy: every failure ends in a
	/// `fatalError`, but each one now says what went wrong.
	///
	/// - Parameters:
	///   - a: First operand array.
	///   - b: Second operand array; must contain as many elements as `a`.
	/// - Returns: The contents of the result buffer after the kernel has run,
	///   or an empty array when the inputs are empty.
	func calculate (a:[Float], b:[Float]) -> [Float]
	{
	    // Basic sanity check
	    guard a.count == b.count else
	    {
	        fatalError("calculate(a:b:) needs arrays of equal length, got \(a.count) and \(b.count)")
	    }
	    let count = a.count

	    // Nothing to compute. Bail out before touching Metal: zero-length
	    // buffers and a zero-sized threadgroup are not valid inputs.
	    guard count > 0 else { return [] }

	    // Create the Metal device & compute shaders, etc.
	    guard let device = MTLCreateSystemDefaultDevice() else
	    {
	        fatalError("No Metal device available")
	    }
	    guard let library = device.makeDefaultLibrary() else
	    {
	        fatalError("Could not load the default Metal library")
	    }
	    guard let add_function = library.makeFunction(name: "add_arrays") else
	    {
	        fatalError("Kernel 'add_arrays' not found in the default Metal library")
	    }
	    let state: MTLComputePipelineState
	    do
	    {
	        state = try device.makeComputePipelineState(function: add_function)
	    }
	    catch
	    {
	        fatalError("Could not create the compute pipeline state: \(error)")
	    }

	    // Allocate memory for the compute shader to use. Every buffer is
	    // created with .storageModeShared; the result buffer's contents are
	    // read back on the CPU below. All three buffers hold `count` floats,
	    // so they share one byte length.
	    let byte_count = MemoryLayout<Float>.stride * count
	    guard
	        let buffer_A      = device.makeBuffer(bytes: a, length: byte_count, options: .storageModeShared),
	        let buffer_B      = device.makeBuffer(bytes: b, length: byte_count, options: .storageModeShared),
	        let buffer_result = device.makeBuffer(length: byte_count, options: .storageModeShared)
	    else
	    {
	        fatalError("Could not allocate \(byte_count)-byte Metal buffers")
	    }

	    guard
	        let command_queue   = device.makeCommandQueue(),
	        let command_buffer  = command_queue.makeCommandBuffer(),
	        let compute_encoder = command_buffer.makeComputeCommandEncoder()
	    else
	    {
	        fatalError("Could not create the Metal command queue / buffer / encoder")
	    }

	    // Buffer indices match the kernel's parameter order.
	    compute_encoder.setComputePipelineState(state)
	    compute_encoder.setBuffer(buffer_A, offset: 0, index: 0)
	    compute_encoder.setBuffer(buffer_B, offset: 0, index: 1)
	    compute_encoder.setBuffer(buffer_result, offset: 0, index: 2)

	    // One thread per element: the grid is `count` threads wide, split into
	    // threadgroups no larger than the pipeline allows.
	    // NOTE(review): dispatchThreads needs GPU support for non-uniform
	    // threadgroup sizes -- confirm for the hardware this has to run on.
	    let grid_size = MTLSizeMake(count, 1, 1)
	    let thread_count = min(state.maxTotalThreadsPerThreadgroup, count)
	    let thread_group_size = MTLSizeMake(thread_count, 1, 1)

	    // Actually make the call
	    compute_encoder.dispatchThreads(grid_size, threadsPerThreadgroup: thread_group_size)
	    compute_encoder.endEncoding()
	    command_buffer.commit()
	    command_buffer.waitUntilCompleted()

	    // Don't hand back whatever happens to be in the buffer if the GPU work failed.
	    if let error = command_buffer.error
	    {
	        fatalError("Metal command buffer failed: \(error)")
	    }

	    // Copy the floats out of the shared buffer into a regular Swift array.
	    let result_pointer = buffer_result.contents().bindMemory(to: Float.self, capacity: count)
	    return Array(UnsafeBufferPointer(start: result_pointer, count: count))
	}




	///
	/// Dumbly putting it through its paces:
	///

	/// Builds an array of `size` random floats, each drawn from `1.0..<100.0`.
	///
	/// - Parameter size: Number of elements to generate.
	/// - Returns: The freshly generated array.
	func create_big_buffer (_ size:Int) -> [Float]
	{
	    return (0..<size).map { _ in Float.random(in: 1.0..<100.0) }
	}

	/// Runs `calculate` once over two random buffers of `count` elements and
	/// prints how long the call took, in seconds.
	///
	/// - Parameter count: Number of elements in each input buffer.
	func test(_ count:Int)
	{
	    // Inputs are generated before the clock starts, so only the
	    // calculate call itself is measured.
	    let lhs = create_big_buffer(count)
	    let rhs = create_big_buffer(count)

	    // Wall-clock timing around the whole call; very rough, and probably
	    // not a good picture of what is happening on the GPU.
	    let started_at = Date()
	    _ = calculate(a: lhs, b: rhs)
	    let elapsed = -started_at.timeIntervalSinceNow

	    print("test:\(count) took \(elapsed)")
	}

	/// Runs `calculate` on a tiny fixed input and prints both operands
	/// followed by the result, one array per line.
	func test_with_output()
	{
	    let lhs: [Float] = [1, 2, 3, 4]
	    let rhs: [Float] = [10, 10, 10, 10]
	    let sum = calculate(a: lhs, b: rhs)

	    for row in [lhs, rhs, sum]
	    {
	        print(row)
	    }
	}

	// Small run with visible output first, then timed runs at growing sizes.
	test_with_output()

	for element_count in [100, 1_000, 10_000, 1_000_000, 10_000_000]
	{
	    test(element_count)
	}

	/// Sample output:
	///
	/// Metal Compute Test[20660:789358] Metal GPU Frame Capture Enabled
	/// Metal Compute Test[20660:789358] Metal API Validation Enabled
	/// [1.0, 2.0, 3.0, 4.0]
	/// [10.0, 10.0, 10.0, 10.0]
	/// [11.0, 12.0, 13.0, 14.0]
	/// test:100 took 0.0023430585861206055
	/// test:1000 took 0.0018529891967773438
	/// test:10000 took 0.001916050910949707
	/// test:1000000 took 0.012307047843933105
	/// test:10000000 took 0.10860800743103027
	/// Program ended with exit code: 0