Skip to content

Instantly share code, notes, and snippets.

@jrprice
Last active August 6, 2020 09:29
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jrprice/3a0714933190dbe1a395 to your computer and use it in GitHub Desktop.
Save jrprice/3a0714933190dbe1a395 to your computer and use it in GitHub Desktop.
Comparison of trivial vector addition in Metal and OpenCL
#include <metal_stdlib>
using namespace metal;

/// Accumulating vector addition: each pass adds a[gid] + b[gid] into c[gid].
/// The host dispatches this kernel repeatedly, so c accumulates the sum
/// across passes (c must start at zero for the host's result check to hold).
kernel void vecadd(const device float *a [[buffer(0)]],
                   const device float *b [[buffer(1)]],
                   device float *c [[buffer(2)]],
                   uint gid [[thread_position_in_grid]])
{
  c[gid] += a[gid] + b[gid];
}
#import <Foundation/Foundation.h>
@import Metal;
// ITRS: number of times the kernel is dispatched (accumulation passes).
#define ITRS 256
// N: number of float elements per vector (16M elements, 64 MiB per buffer).
#define N (16*1024*1024)
// Host driver: dispatches the vecadd kernel ITRS times on N-element vectors,
// times the batch, and spot-checks the accumulated result (c[i] == 2*i).
int main(int argc, const char * argv[]) {
    @autoreleasepool {
        // Create Metal device, queue and function; fail fast if unavailable.
        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
        if (!device) {
            fprintf(stderr, "No Metal device available\n");
            return 1;
        }
        id<MTLCommandQueue> queue = [device newCommandQueue];
        id<MTLLibrary> library = [device newDefaultLibrary];
        id<MTLFunction> function = [library newFunctionWithName:@"vecadd"];
        if (!function) {
            fprintf(stderr, "Failed to load the vecadd kernel function\n");
            return 1;
        }
        NSLog(@"Device = %@\n", device.name);

        // Create and initialize buffers. Use sizeof(float) rather than a
        // hard-coded 4 so the size stays correct if the element type changes.
        NSUInteger bytes = N * sizeof(float);
        id<MTLBuffer> d_a = [device newBufferWithLength:bytes options:0];
        id<MTLBuffer> d_b = [device newBufferWithLength:bytes options:0];
        id<MTLBuffer> d_c = [device newBufferWithLength:bytes options:0];
        float *h_a = [d_a contents];
        float *h_b = [d_b contents];
        for (int i = 0; i < N; i++) {
            h_a[i] = i/(float)ITRS;
            h_b[i] = i/(float)ITRS;
        }
        // The kernel accumulates into c, and the check below expects
        // c[i] == 2*i after ITRS passes, so c must start at zero. A new
        // MTLBuffer's contents are not guaranteed to be zeroed.
        float *h_c_init = [d_c contents];
        for (int i = 0; i < N; i++) {
            h_c_init[i] = 0.0f;
        }

        // Build pipeline state; check the nil return, not the error pointer.
        NSError *errors = nil;
        id<MTLComputePipelineState> state =
            [device newComputePipelineStateWithFunction:function error:&errors];
        if (!state) {
            NSLog(@"Failed to create compute pipeline state: %@", errors);
            return 1;
        }

        // Create command encoder/buffer and bind the kernel arguments.
        id<MTLCommandBuffer> buffer = [queue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [buffer computeCommandEncoder];
        [encoder setComputePipelineState:state];
        [encoder setBuffer:d_a offset:0 atIndex:0];
        [encoder setBuffer:d_b offset:0 atIndex:1];
        [encoder setBuffer:d_c offset:0 atIndex:2];

        // Start timing
        NSDate *start = [NSDate date];

        // Dispatch ITRS accumulation passes (N must be a multiple of the
        // threadgroup width, 64 — true for the 16M default).
        MTLSize groupsize = {64, 1, 1};
        MTLSize numgroups = {N/groupsize.width, 1, 1};
        for (int i = 0; i < ITRS; i++) {
            [encoder dispatchThreadgroups:numgroups threadsPerThreadgroup:groupsize];
        }
        [encoder endEncoding];
        [buffer commit];
        [buffer waitUntilCompleted];

        // Stop timing. timeIntervalSinceNow is negative for a past date, so
        // negate it instead of papering over the sign with fabs().
        NSTimeInterval elapsed = -[start timeIntervalSinceNow];

        // Check a few results: after ITRS passes, c[i] == i/ITRS * 2 * ITRS == 2*i.
        float *h_c = [d_c contents];
        for (int i = 0; i < N; i += 1024*1024) {
            if (h_c[i] != i*2)
                NSLog(@"Error at position %d: %f vs %f\n", i, h_c[i], (float)(i*2));
        }
        printf("Elapsed time = %lf\n", elapsed);
    }
    return 0;
}
import numpy
import pyopencl as CL
import time
ITRS = 256
N = 16*1024*1024
KERNEL = '''
kernel void vecadd(const global float *a,
const global float *b,
global float *c)
{
int i = get_global_id(0);
c[i] = c[i] + a[i] + b[i];
}
'''
# Initialise OpenCL
context = CL.create_some_context()
queue = CL.CommandQueue(context)
program = CL.Program(context, KERNEL).build()
print 'Device = ' + context.devices[0].name
# Initialise host data
h_a = numpy.arange(0, N/float(ITRS), 1/float(ITRS)).astype(numpy.float32)
h_b = numpy.arange(0, N/float(ITRS), 1/float(ITRS)).astype(numpy.float32)
h_c = numpy.empty(N, numpy.float32)
# Initialise device data
RWCP = CL.mem_flags.READ_WRITE | CL.mem_flags.COPY_HOST_PTR
d_a = CL.Buffer(context, RWCP, hostbuf=h_a)
d_b = CL.Buffer(context, RWCP, hostbuf=h_b)
d_c = CL.Buffer(context, CL.mem_flags.WRITE_ONLY, size=N*4)
# Run kernel
start = time.time()
for i in range(ITRS):
program.vecadd(queue, (N,), (64,), d_a, d_b, d_c)
queue.finish()
end = time.time()
# Read results and check a few
CL.enqueue_copy(queue, h_c, d_c)
for i in range(0,N,1024*1024):
if h_c[i] != i*2:
print 'Error at position %d: %f vs %f' % (i, h_c[i], i*2)
print 'Total kernel time: %.2f ms' % ((end-start)*1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment