Last active
August 6, 2020 09:29
-
-
Save jrprice/3a0714933190dbe1a395 to your computer and use it in GitHub Desktop.
Comparison of trivial vector addition in Metal and OpenCL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import <Foundation/Foundation.h> | |
@import Metal; | |
#define ITRS 256 | |
#define N (16*1024*1024) | |
/// Benchmark driver: encodes ITRS dispatches of the `vecadd` compute function
/// over N-element float buffers into one command buffer, times submission until
/// completion, and spot-checks the output.
///
/// The `vecadd` function itself lives in the default Metal library and is not
/// visible here; the result check below expects c[i] == 2*i after ITRS
/// dispatches with a[i] == b[i] == i/ITRS, so it presumably accumulates
/// c += a + b on each dispatch -- confirm against the .metal source.
///
/// @return 0 when the benchmark ran (result mismatches are only logged),
///         1 when Metal setup or GPU execution failed.
int main(int argc, const char * argv[]) {
    @autoreleasepool {
        // Create Metal device, queue and function
        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
        if (!device) {
            NSLog(@"No Metal device available\n");
            return 1;
        }
        NSLog(@"Device = %@\n", device.name);

        id<MTLCommandQueue> queue = [device newCommandQueue];
        if (!queue) {
            NSLog(@"Failed to create a command queue\n");
            return 1;
        }

        id<MTLLibrary> library = [device newDefaultLibrary];
        if (!library) {
            NSLog(@"Failed to load the default Metal library\n");
            return 1;
        }

        id<MTLFunction> function = [library newFunctionWithName:@"vecadd"];
        if (!function) {
            NSLog(@"Function 'vecadd' not found in the default Metal library\n");
            return 1;
        }

        // Build the pipeline state before any encoder exists, so a failure here
        // cannot leave an encoder open. Check the return value, not the error.
        NSError *error = nil;
        id<MTLComputePipelineState> state =
            [device newComputePipelineStateWithFunction:function error:&error];
        if (!state) {
            NSLog(@"Failed to create compute pipeline state: %@\n", error);
            return 1;
        }

        // Create and initialize buffers
        const NSUInteger bytes = (NSUInteger)N * sizeof(float);
        id<MTLBuffer> d_a = [device newBufferWithLength:bytes options:0];
        id<MTLBuffer> d_b = [device newBufferWithLength:bytes options:0];
        // d_c is never written by the host; the final check relies on it
        // starting out zero-filled (see the newBufferWithLength:options: docs).
        id<MTLBuffer> d_c = [device newBufferWithLength:bytes options:0];
        if (!d_a || !d_b || !d_c) {
            NSLog(@"Failed to allocate %lu-byte buffers\n", (unsigned long)bytes);
            return 1;
        }

        float *h_a = [d_a contents];
        float *h_b = [d_b contents];
        for (int i = 0; i < N; i++) {
            // Inputs are scaled by 1/ITRS so that ITRS accumulations of a+b give 2*i
            h_a[i] = i/(float)ITRS;
            h_b[i] = i/(float)ITRS;
        }

        // Create command encoder/buffer and set compute pipeline state
        id<MTLCommandBuffer> buffer = [queue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [buffer computeCommandEncoder];
        if (!buffer || !encoder) {
            NSLog(@"Failed to create command buffer or compute encoder\n");
            return 1;
        }
        [encoder setComputePipelineState:state];
        [encoder setBuffer:d_a offset:0 atIndex:0];
        [encoder setBuffer:d_b offset:0 atIndex:1];
        [encoder setBuffer:d_c offset:0 atIndex:2];

        // Start timing
        NSDate *start = [NSDate date];

        // Dispatch compute commands: one thread per element, 64 threads per
        // group (N is a multiple of 64, so the division is exact)
        MTLSize groupsize = MTLSizeMake(64, 1, 1);
        MTLSize numgroups = MTLSizeMake(N/groupsize.width, 1, 1);
        for (int i = 0; i < ITRS; i++) {
            [encoder dispatchThreadgroups:numgroups threadsPerThreadgroup:groupsize];
        }
        [encoder endEncoding];
        [buffer commit];
        [buffer waitUntilCompleted];

        // Stop timing (timeIntervalSinceNow is negative for a past date, hence fabs below)
        NSTimeInterval elapsed = [start timeIntervalSinceNow];

        // A failed command buffer would make both the timing and the results meaningless
        if (buffer.status == MTLCommandBufferStatusError) {
            NSLog(@"Command buffer execution failed: %@\n", buffer.error);
            return 1;
        }

        // Check a few results
        float *h_c = [d_c contents];
        for (int i = 0; i < N; i+=1024*1024) {
            if (h_c[i] != i*2)
                NSLog(@"Error at position %d: %f vs %f\n", i, h_c[i], (float)(i*2));
        }

        printf("Elapsed time = %lf\n", fabs(elapsed));
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
import pyopencl as CL | |
import time | |
# Benchmark: enqueue the accumulating `vecadd` kernel ITRS times over N floats,
# time the enqueues up to queue.finish(), then spot-check the output.
ITRS = 256
N = 16*1024*1024

# Each launch adds a[i] + b[i] onto c[i], so c is both read and written.
KERNEL = '''
kernel void vecadd(const global float *a,
const global float *b,
global float *c)
{
int i = get_global_id(0);
c[i] = c[i] + a[i] + b[i];
}
'''

# Initialise OpenCL
context = CL.create_some_context()
queue = CL.CommandQueue(context)
program = CL.Program(context, KERNEL).build()
# Single-argument print(...) behaves the same on Python 2 and Python 3
print('Device = ' + context.devices[0].name)

# Initialise host data: a[i] = b[i] = i/ITRS, so ITRS accumulations give 2*i
h_a = numpy.arange(0, N/float(ITRS), 1/float(ITRS)).astype(numpy.float32)
h_b = numpy.arange(0, N/float(ITRS), 1/float(ITRS)).astype(numpy.float32)
# Zeroed (not empty) because it seeds the accumulator buffer below
h_c = numpy.zeros(N, numpy.float32)

# Initialise device data
RWCP = CL.mem_flags.READ_WRITE | CL.mem_flags.COPY_HOST_PTR
d_a = CL.Buffer(context, RWCP, hostbuf=h_a)
d_b = CL.Buffer(context, RWCP, hostbuf=h_b)
# The kernel reads c[i] before writing it, so the buffer must be readable and
# start at zero; a WRITE_ONLY, uninitialised buffer makes the result undefined.
d_c = CL.Buffer(context, RWCP, hostbuf=h_c)

# Run kernel: global size N, work-group size 64
start = time.time()
for i in range(ITRS):
    program.vecadd(queue, (N,), (64,), d_a, d_b, d_c)
queue.finish()
end = time.time()

# Read results and check a few
CL.enqueue_copy(queue, h_c, d_c)
for i in range(0,N,1024*1024):
    if h_c[i] != i*2:
        print('Error at position %d: %f vs %f' % (i, h_c[i], i*2))

print('Total kernel time: %.2f ms' % ((end-start)*1000))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment