Benchmark showing how locks that share a cache line contend with each other (false sharing), compared with the same locks spread out one per cache line
#import <Foundation/Foundation.h>
#import <dispatch/dispatch.h>
#import <os/lock.h>
#import <stdio.h>
#import <time.h>

#define ITERS 2000
#define NSEC_PER_ITER(time) (((double)time * (double)NSEC_PER_SEC) / (double)ITERS)

#define TEST(body, name) do {\
    start = [NSDate date];\
    for (int i = 0; i < ITERS; i++) {\
        body\
    }\
    elapsed = -[start timeIntervalSinceNow];\
    if (baseline == 0) printf("Baseline of %f nanoseconds to repeatedly lock-unlock 16 striped unfair locks\n", NSEC_PER_ITER(elapsed));\
    else printf("It was %.2fx as fast to use %s\n", baseline / elapsed, name);\
} while (0)

// The unused buffers pad the lock arrays away from other globals (and from each
// other), so nothing else happens to land on the same cache lines.
static char unusedBuffer1[16384] = { 1 };
// 16 locks packed back to back: at 4 bytes each, all 16 fit in a single 64-byte cache line.
static os_unfair_lock packed[16] __attribute__((aligned(64))) = { OS_UNFAIR_LOCK_INIT };
static char unusedBuffer2[16384] = { 1 };
static char unusedBuffer3[16384] = { 1 };
// 16 locks spread out: only every 16th slot is used, so each live lock gets its own cache line.
// (Only the first element is explicitly initialized; the rest are zero, which is also the unlocked state.)
static os_unfair_lock spread[16 * 16] __attribute__((aligned(64))) = { OS_UNFAIR_LOCK_INIT };
static char unusedBuffer4[16384] = { 1 };

int main() {
    NSDate *start = nil;
    NSTimeInterval elapsed = 0;
    NSTimeInterval baseline = 0;

    void (^packedTest)(void) = ^{
        dispatch_apply(16, dispatch_get_global_queue(0, 0), ^(size_t idx) {
            for (int i = 0; i < 1000; i++) {
                os_unfair_lock_lock(&packed[idx]);
                os_unfair_lock_unlock(&packed[idx]);
            }
        });
    };
    TEST(packedTest();, "packed");
    baseline = elapsed;

    void (^spreadTest)(void) = ^{
        dispatch_apply(16, dispatch_get_global_queue(0, 0), ^(size_t idx) {
            for (int i = 0; i < 1000; i++) {
                // Only use 1 lock out of every 16 slots. 4 bytes * 16 is 64 bytes, which is the size of a cache line.
                os_unfair_lock_lock(&spread[idx * 16]);
                os_unfair_lock_unlock(&spread[idx * 16]);
            }
        });
    };
    TEST(spreadTest();, "spread locks"); // On my laptop, this is ~5.5x as fast as the previous one, despite doing "the same" work
}
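
A related trick, sketched below (not part of the benchmark above): instead of striding through a flat array, pad each lock into a cache-line-sized struct so an ordinary array of them never shares lines. PaddedLock and paddedLocks are illustrative names, and the 64-byte figure assumes the same cache-line size the benchmark assumes.

// Minimal sketch, assuming a 64-byte cache line (uses <os/lock.h>, imported above).
// Pad each lock out to a full line so adjacent array elements never share one.
typedef struct {
    os_unfair_lock lock;
    char pad[64 - sizeof(os_unfair_lock)];
} PaddedLock;

// Align the array itself so element boundaries coincide with cache-line boundaries.
// Zero-initialized storage is already the unlocked state, same as OS_UNFAIR_LOCK_INIT.
static PaddedLock paddedLocks[16] __attribute__((aligned(64)));

// Usage mirrors the spread case: take &paddedLocks[idx].lock instead of &spread[idx * 16].

Compared with the flat spread array, this keeps the indexing simple (paddedLocks[idx]) at the cost of roughly 60 wasted bytes per lock.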