prufrock/MetalAdder.swift

## MetalAdder.swift
import PlaygroundSupport
import MetalKit

// Number of Float elements in each array. Apple's example uses 1 << 24, but
// filling two buffers of that length takes too long here, so a smaller count is used.
// let arrayLength: Int = 1 << 24
let arrayLength = 1 << 20
// Size in bytes of one buffer holding `arrayLength` Floats.
let bufferSize = MemoryLayout<Float>.stride * arrayLength

/// Adds two arrays of `Float` element-wise on the GPU with a Metal compute kernel
/// and checks the GPU output against the same sums computed on the CPU.
///
/// Typical use: create with a device, call `prepareData()`, then `sendComputeCommand()`.
class MetalAdder {
    /// The Metal device used to compile the kernel and allocate buffers.
    var device: MTLDevice

    /// The compute pipeline built from the `add_arrays` kernel compiled in `init`.
    var addFunctionPSO: MTLComputePipelineState

    /// The command queue used to pass commands to the device.
    var commandQueue: MTLCommandQueue

    /// Input (A, B) and output (Result) buffers; `nil` until `prepareData()` has run.
    var mBufferA: MTLBuffer?
    var mBufferB: MTLBuffer?
    var mBufferResult: MTLBuffer?

    /// Compiles the `add_arrays` kernel from inline source, then creates the
    /// compute pipeline state and the command queue.
    ///
    /// - Parameter withDevice: The Metal device to run on.
    /// - Throws: Any error raised while compiling the shader source or creating
    ///   the pipeline state.
    init(withDevice: MTLDevice) throws {
        device = withDevice

        let shader = """
        #include <metal_stdlib>
        using namespace metal;
        /// This is a Metal Shading Language (MSL) function equivalent to the add_arrays() C function, used to perform the calculation on a GPU.
        kernel void add_arrays(device const float* inA,
                               device const float* inB,
                               device float* result,
                               uint index [[thread_position_in_grid]])
        {
            // the for-loop is replaced with a collection of threads, each of which
            // calls this function.
            result[index] = inA[index] + inB[index];
        }
        """

        let library = try withDevice.makeLibrary(source: shader, options: nil)

        guard let addFunction = library.makeFunction(name: "add_arrays") else {
            fatalError("Could not create the adder function")
        }

        // Create a compute pipeline state object
        addFunctionPSO = try withDevice.makeComputePipelineState(function: addFunction)

        guard let queue = withDevice.makeCommandQueue() else {
            fatalError("Could not create command queue")
        }
        commandQueue = queue
    }

    /// Allocates the two input buffers and the result buffer (`bufferSize` bytes
    /// each, shared storage) and fills the inputs with random data.
    func prepareData() {
        // Bind the new buffers to locals first so no force-unwrap is needed below.
        guard let bufferA = device.makeBuffer(length: bufferSize, options: .storageModeShared),
              let bufferB = device.makeBuffer(length: bufferSize, options: .storageModeShared),
              let bufferResult = device.makeBuffer(length: bufferSize, options: .storageModeShared) else {
            fatalError("Could not allocate the data buffers")
        }

        mBufferA = bufferA
        mBufferB = bufferB
        mBufferResult = bufferResult

        generateRandomFloatData(buffer: bufferA)
        generateRandomFloatData(buffer: bufferB)
    }

    /// Encodes the add kernel, submits it, blocks until the GPU has finished,
    /// prints the elapsed time and then verifies the results on the CPU.
    func sendComputeCommand() {
        guard mBufferA != nil, mBufferB != nil, mBufferResult != nil else {
            fatalError("prepareData() must be called before sendComputeCommand()")
        }

        // Create a command buffer to hold commands.
        guard let commandBuffer = commandQueue.makeCommandBuffer() else {
            fatalError("Could not create command buffer")
        }

        // Start a compute pass.
        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
            fatalError("Could not create compute encoder")
        }

        // Encodes the dispatch and ends the compute pass.
        encodeAddCommand(computeEncoder: computeEncoder)

        // Start timing before the commit so the whole submission is measured,
        // not only the wait that follows it.
        let startTime = CFAbsoluteTimeGetCurrent()

        // Submit the command buffer for execution.
        commandBuffer.commit()

        // Block until the calculation is complete.
        commandBuffer.waitUntilCompleted()
        let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime
        print("Time elapsed for GPU calculation: \(timeElapsed) s.")

        // A failed command buffer leaves the result buffer meaningless.
        if let error = commandBuffer.error {
            fatalError("GPU command buffer failed: \(error)")
        }

        verifyResults()
    }

    /// Binds the pipeline state and the three buffers, dispatches one thread per
    /// array element and ends encoding on the given encoder.
    ///
    /// - Parameter computeEncoder: The encoder to record the add command into.
    func encodeAddCommand(computeEncoder: MTLComputeCommandEncoder) {
        // Encode the pipeline state object and its parameters.
        computeEncoder.setComputePipelineState(addFunctionPSO)
        computeEncoder.setBuffer(mBufferA, offset: 0, index: 0)
        computeEncoder.setBuffer(mBufferB, offset: 0, index: 1)
        computeEncoder.setBuffer(mBufferResult, offset: 0, index: 2)

        let gridSize = MTLSizeMake(arrayLength, 1, 1)

        // A threadgroup may not be larger than the pipeline allows nor than the grid.
        let threadsPerGroup = min(addFunctionPSO.maxTotalThreadsPerThreadgroup, arrayLength)
        let threadgroupSize = MTLSizeMake(threadsPerGroup, 1, 1)

        // Encode the compute command.
        computeEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: threadgroupSize)

        computeEncoder.endEncoding()
    }

    /// Fills the buffer with random `Float` values in the range 0...1.
    ///
    /// At most `arrayLength` elements are written, and never more than fit in
    /// the buffer.
    ///
    /// - Parameter buffer: The buffer whose contents are overwritten.
    func generateRandomFloatData(buffer: MTLBuffer) {
        let dataPtr = buffer.contents().assumingMemoryBound(to: Float.self)
        let elementCount = min(arrayLength, buffer.length / MemoryLayout<Float>.stride)

        // arc4random() spans the full UInt32 range, so scale by UInt32.max.
        let scale = Float(UInt32.max)
        for index in 0..<elementCount {
            dataPtr[index] = Float(arc4random()) / scale
        }
    }

    /// Recomputes every sum on the CPU and compares it with the GPU result.
    ///
    /// Each mismatch is printed and triggers an assertion failure in debug
    /// builds. The success message is printed only when every element matches.
    func verifyResults() {
        guard let bufferA = mBufferA, let bufferB = mBufferB, let bufferResult = mBufferResult else {
            fatalError("prepareData() must be called before verifyResults()")
        }

        let a = bufferA.contents().assumingMemoryBound(to: Float.self)
        let b = bufferB.contents().assumingMemoryBound(to: Float.self)
        let result = bufferResult.contents().assumingMemoryBound(to: Float.self)

        var mismatchCount = 0
        let startTime = CFAbsoluteTimeGetCurrent()
        for index in 0..<arrayLength {
            let total = a[index] + b[index]
            if result[index] != total {
                print("Compute error index=\(index) result=\(result[index]) \(total)=a+b")
                mismatchCount += 1
                assertionFailure("GPU result differs from CPU result at index \(index)")
            }
        }
        let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime
        print("Time elapsed for CPU calculation: \(timeElapsed) s.")

        if mismatchCount == 0 {
            print("Compute results as expected")
        } else {
            print("Compute results differ from the CPU at \(mismatchCount) indices")
        }
    }
}

// Pick the system default Metal device; without one there is nothing to run on.
guard let device = MTLCreateSystemDefaultDevice() else {
    fatalError("GPU is not supported")
}

// Build the adder, fill the input buffers, then run the GPU add, which also
// verifies the result.
let adder = try MetalAdder(withDevice: device)
adder.prepareData()
adder.sendComputeCommand()
	import PlaygroundSupport
	import MetalKit

	// Number of Float elements in each array. Apple's example uses 1 << 24, but
	// filling two buffers of that length takes too long here, so a smaller count is used.
	// let arrayLength: Int = 1 << 24
	let arrayLength = 1 << 20
	// Size in bytes of one buffer holding `arrayLength` Floats.
	let bufferSize = MemoryLayout<Float>.stride * arrayLength

	/// Adds two arrays of `Float` element-wise on the GPU with a Metal compute kernel
	/// and checks the GPU output against the same sums computed on the CPU.
	///
	/// Typical use: create with a device, call `prepareData()`, then `sendComputeCommand()`.
	class MetalAdder {
	    /// The Metal device used to compile the kernel and allocate buffers.
	    var device: MTLDevice

	    /// The compute pipeline built from the `add_arrays` kernel compiled in `init`.
	    var addFunctionPSO: MTLComputePipelineState

	    /// The command queue used to pass commands to the device.
	    var commandQueue: MTLCommandQueue

	    /// Input (A, B) and output (Result) buffers; `nil` until `prepareData()` has run.
	    var mBufferA: MTLBuffer?
	    var mBufferB: MTLBuffer?
	    var mBufferResult: MTLBuffer?

	    /// Compiles the `add_arrays` kernel from inline source, then creates the
	    /// compute pipeline state and the command queue.
	    ///
	    /// - Parameter withDevice: The Metal device to run on.
	    /// - Throws: Any error raised while compiling the shader source or creating
	    ///   the pipeline state.
	    init(withDevice: MTLDevice) throws {
	        device = withDevice

	        let shader = """
	        #include <metal_stdlib>
	        using namespace metal;
	        /// This is a Metal Shading Language (MSL) function equivalent to the add_arrays() C function, used to perform the calculation on a GPU.
	        kernel void add_arrays(device const float* inA,
	        device const float* inB,
	        device float* result,
	        uint index [[thread_position_in_grid]])
	        {
	        // the for-loop is replaced with a collection of threads, each of which
	        // calls this function.
	        result[index] = inA[index] + inB[index];
	        }
	        """

	        let library = try withDevice.makeLibrary(source: shader, options: nil)

	        guard let addFunction = library.makeFunction(name: "add_arrays") else {
	            fatalError("Could not create the adder function")
	        }

	        // Create a compute pipeline state object
	        addFunctionPSO = try withDevice.makeComputePipelineState(function: addFunction)

	        guard let queue = withDevice.makeCommandQueue() else {
	            fatalError("Could not create command queue")
	        }
	        commandQueue = queue
	    }

	    /// Allocates the two input buffers and the result buffer (`bufferSize` bytes
	    /// each, shared storage) and fills the inputs with random data.
	    func prepareData() {
	        // Bind the new buffers to locals first so no force-unwrap is needed below.
	        guard let bufferA = device.makeBuffer(length: bufferSize, options: .storageModeShared),
	              let bufferB = device.makeBuffer(length: bufferSize, options: .storageModeShared),
	              let bufferResult = device.makeBuffer(length: bufferSize, options: .storageModeShared) else {
	            fatalError("Could not allocate the data buffers")
	        }

	        mBufferA = bufferA
	        mBufferB = bufferB
	        mBufferResult = bufferResult

	        generateRandomFloatData(buffer: bufferA)
	        generateRandomFloatData(buffer: bufferB)
	    }

	    /// Encodes the add kernel, submits it, blocks until the GPU has finished,
	    /// prints the elapsed time and then verifies the results on the CPU.
	    func sendComputeCommand() {
	        guard mBufferA != nil, mBufferB != nil, mBufferResult != nil else {
	            fatalError("prepareData() must be called before sendComputeCommand()")
	        }

	        // Create a command buffer to hold commands.
	        guard let commandBuffer = commandQueue.makeCommandBuffer() else {
	            fatalError("Could not create command buffer")
	        }

	        // Start a compute pass.
	        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
	            fatalError("Could not create compute encoder")
	        }

	        // Encodes the dispatch and ends the compute pass.
	        encodeAddCommand(computeEncoder: computeEncoder)

	        // Start timing before the commit so the whole submission is measured,
	        // not only the wait that follows it.
	        let startTime = CFAbsoluteTimeGetCurrent()

	        // Submit the command buffer for execution.
	        commandBuffer.commit()

	        // Block until the calculation is complete.
	        commandBuffer.waitUntilCompleted()
	        let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime
	        print("Time elapsed for GPU calculation: \(timeElapsed) s.")

	        // A failed command buffer leaves the result buffer meaningless.
	        if let error = commandBuffer.error {
	            fatalError("GPU command buffer failed: \(error)")
	        }

	        verifyResults()
	    }

	    /// Binds the pipeline state and the three buffers, dispatches one thread per
	    /// array element and ends encoding on the given encoder.
	    ///
	    /// - Parameter computeEncoder: The encoder to record the add command into.
	    func encodeAddCommand(computeEncoder: MTLComputeCommandEncoder) {
	        // Encode the pipeline state object and its parameters.
	        computeEncoder.setComputePipelineState(addFunctionPSO)
	        computeEncoder.setBuffer(mBufferA, offset: 0, index: 0)
	        computeEncoder.setBuffer(mBufferB, offset: 0, index: 1)
	        computeEncoder.setBuffer(mBufferResult, offset: 0, index: 2)

	        let gridSize = MTLSizeMake(arrayLength, 1, 1)

	        // A threadgroup may not be larger than the pipeline allows nor than the grid.
	        let threadsPerGroup = min(addFunctionPSO.maxTotalThreadsPerThreadgroup, arrayLength)
	        let threadgroupSize = MTLSizeMake(threadsPerGroup, 1, 1)

	        // Encode the compute command.
	        computeEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: threadgroupSize)

	        computeEncoder.endEncoding()
	    }

	    /// Fills the buffer with random `Float` values in the range 0...1.
	    ///
	    /// At most `arrayLength` elements are written, and never more than fit in
	    /// the buffer.
	    ///
	    /// - Parameter buffer: The buffer whose contents are overwritten.
	    func generateRandomFloatData(buffer: MTLBuffer) {
	        let dataPtr = buffer.contents().assumingMemoryBound(to: Float.self)
	        let elementCount = min(arrayLength, buffer.length / MemoryLayout<Float>.stride)

	        // arc4random() spans the full UInt32 range, so scale by UInt32.max.
	        let scale = Float(UInt32.max)
	        for index in 0..<elementCount {
	            dataPtr[index] = Float(arc4random()) / scale
	        }
	    }

	    /// Recomputes every sum on the CPU and compares it with the GPU result.
	    ///
	    /// Each mismatch is printed and triggers an assertion failure in debug
	    /// builds. The success message is printed only when every element matches.
	    func verifyResults() {
	        guard let bufferA = mBufferA, let bufferB = mBufferB, let bufferResult = mBufferResult else {
	            fatalError("prepareData() must be called before verifyResults()")
	        }

	        let a = bufferA.contents().assumingMemoryBound(to: Float.self)
	        let b = bufferB.contents().assumingMemoryBound(to: Float.self)
	        let result = bufferResult.contents().assumingMemoryBound(to: Float.self)

	        var mismatchCount = 0
	        let startTime = CFAbsoluteTimeGetCurrent()
	        for index in 0..<arrayLength {
	            let total = a[index] + b[index]
	            if result[index] != total {
	                print("Compute error index=\(index) result=\(result[index]) \(total)=a+b")
	                mismatchCount += 1
	                assertionFailure("GPU result differs from CPU result at index \(index)")
	            }
	        }
	        let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime
	        print("Time elapsed for CPU calculation: \(timeElapsed) s.")

	        if mismatchCount == 0 {
	            print("Compute results as expected")
	        } else {
	            print("Compute results differ from the CPU at \(mismatchCount) indices")
	        }
	    }
	}

	// Pick the system default Metal device; without one there is nothing to run on.
	guard let device = MTLCreateSystemDefaultDevice() else {
	    fatalError("GPU is not supported")
	}

	// Build the adder, fill the input buffers, then run the GPU add, which also
	// verifies the result.
	let adder = try MetalAdder(withDevice: device)
	adder.prepareData()
	adder.sendComputeCommand()