Last active
September 5, 2023 16:42
-
-
Save pkhuong/d9a875d63ff53a76588f70855416e767 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% perf stat ./tsx_per_core | |
Aborts: 73381 | |
No reason: 57535 | |
reason 0: 0 | |
reason 1: 15833 | |
reason 2: 15833 | |
reason 3: 13 | |
reason 4: 0 | |
reason 5: 0 | |
81597398 68933303 85476712 72654716 76819140 72119787 77728188 74167801 75244800 72119632 75951739 72606078 74035109 70998737 89185152 58165171 86098994 83338241 88201758 81982602 77964219 77861602 75642915 79207346 73514808 79929374 74429694 71508632 0 0 0 0 | |
2147483648 | |
2147483648 | |
Performance counter stats for './tsx_per_core': | |
104039.835311 task-clock (msec) # 21.772 CPUs utilized | |
15,830 context-switches # 0.152 K/sec | |
232 cpu-migrations # 0.002 K/sec | |
516 page-faults # 0.005 K/sec | |
207,494,954,179 cycles # 1.994 GHz | |
0 stalled-cycles-frontend # 0.00% frontend cycles idle | |
0 stalled-cycles-backend # 0.00% backend cycles idle | |
28,541,848,264 instructions # 0.14 insns per cycle | |
8,698,290,000 branches # 83.605 M/sec | |
3,886,058 branch-misses # 0.04% of all branches | |
4.778610745 seconds time elapsed | |
-> ~96 cycles/increment on E5-2650L v4 (Broadwell). I get the same on one thread, so it looks
like we're just paying for TSX overhead, without additional cache coherency noise.
One thread, w/o rdtscp: 66 cycles (fits with ~30 cycles latency on rdtscp). | |
So, not exactly *fast*, but reasonably efficient (~ 1 LLC cache miss) for non-trivial work | |
on core-local data structures. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <cpuid.h> | |
#include <inttypes.h> | |
#include <pthread.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <immintrin.h> | |
/* Element count of the array X. X must be a real array, not a pointer. */
#define ARRAY_SIZE(X) (sizeof(X) / sizeof(*(X)))
/* Number of test_percore() calls issued by each worker thread (2^25). */
static const size_t niter = (size_t)1 << 25;

/*
 * A single 64-bit counter, aligned to 64 bytes so that consecutive
 * array elements start 64 bytes apart.
 */
struct counter {
	uint64_t count;
} __attribute__((__aligned__(64)));

/* Counter slots; test_percore() indexes them by core id modulo the array size. */
struct counter counters[32];

/* Total number of transaction attempts that did not start/commit. */
static uint64_t abort_count;
/* Aborts whose status word was exactly zero. */
static uint64_t no_reason_count;
/* reason_count[i] counts aborts whose status word had bit i set. */
static uint64_t reason_count[6];
/* cpu_has_rtm lifted from https://github.com/andikleen/tsx-tools/ | |
* | |
* Copyright (c) 2012,2013 Intel Corporation | |
* Author: Andi Kleen | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that: (1) source code distributions | |
* retain the above copyright notice and this paragraph in its entirety, (2) | |
* distributions including binary code include the above copyright notice and | |
* this paragraph in its entirety in the documentation or other materials | |
* provided with the distribution | |
* | |
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED | |
* WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF | |
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | |
*/ | |
/* Bit tested in EBX of CPUID leaf 7, subleaf 0, to detect RTM. */
#define CPUID_RTM (1 << 11)

/*
 * Returns 1 when CPUID leaf 7 (subleaf 0) reports the RTM bit in EBX,
 * and 0 when the bit is clear or leaf 7 is not available.
 */
static inline int cpu_has_rtm(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7 must exist before it can be queried. */
	if (__get_cpuid_max(0, NULL) < 7) {
		return 0;
	}

	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	return (ebx & CPUID_RTM) != 0;
}
/*
 * Executes RDTSCP and returns the low 8 bits of the value the instruction
 * leaves in ECX. Callers in this file use that value as a core identifier
 * (NOTE(review): this relies on the OS storing the CPU number in the low
 * bits of IA32_TSC_AUX -- confirm for the target platform).
 *
 * OUT_ticks: when non-NULL, receives the 64-bit timestamp assembled from
 *            EDX (high 32 bits) and EAX (low 32 bits).
 */
static inline uint8_t
get_core(uint64_t *OUT_ticks)
{
	uint32_t eax, ecx, edx;

	/*
	 * The asm must be volatile: it has outputs but no inputs, so without
	 * the qualifier the compiler may treat it as a pure computation and
	 * merge repeated calls, hoist it out of a loop, or move it across the
	 * surrounding code (e.g. out of a transaction), yielding a stale core
	 * id or timestamp.
	 */
	__asm__ __volatile__("rdtscp" : "=a"(eax), "=c"(ecx), "=d"(edx));

	if (OUT_ticks != NULL) {
		*OUT_ticks = ((uint64_t)edx << 32) | eax;
	}

	return (uint8_t)ecx;
}
void | |
test_percore(void) | |
{ | |
while (1) { | |
unsigned int reason; | |
reason = _xbegin(); | |
if (reason == _XBEGIN_STARTED) { | |
uint8_t cpu; | |
cpu = get_core(NULL); | |
counters[cpu % ARRAY_SIZE(counters)].count++; | |
_xend(); | |
return; | |
} | |
__sync_fetch_and_add(&abort_count, 1); | |
if (reason == 0) { | |
__sync_fetch_and_add(&no_reason_count, 1); | |
} | |
for (size_t i = 0; i < ARRAY_SIZE(reason_count); i++) { | |
if ((reason & (1UL << i)) != 0) { | |
__sync_fetch_and_add(&reason_count[i], 1); | |
} | |
} | |
} | |
return; | |
} | |
static void * | |
worker(void *arg) | |
{ | |
(void)arg; | |
for (size_t i = 0; i < niter; i++) { | |
test_percore(); | |
} | |
return NULL; | |
} | |
int | |
main() | |
{ | |
pthread_t threads[64]; /* overcommit # threads to force context switches. */ | |
uint64_t total_count = 0; | |
assert(cpu_has_rtm()); | |
for (size_t i = 0; i < ARRAY_SIZE(threads); i++) { | |
pthread_create(&threads[i], NULL, worker, NULL); | |
} | |
for (size_t i = 0; i < ARRAY_SIZE(threads); i++) { | |
pthread_join(threads[i], NULL); | |
} | |
printf("Aborts: %" PRIu64 "\n", abort_count); | |
printf("No reason: %" PRIu64 "\n", no_reason_count); | |
for (size_t i = 0; i < ARRAY_SIZE(reason_count); i++) { | |
printf("reason %zu: %" PRIu64 "\n", i, reason_count[i]); | |
} | |
for (size_t i = 0; i < ARRAY_SIZE(counters); i++) { | |
printf("%" PRIu64 " ", counters[i].count); | |
total_count += counters[i].count; | |
} | |
printf("\n%" PRIu64 "\n%zu\n", total_count, ARRAY_SIZE(threads) * niter); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment