Skip to content

Instantly share code, notes, and snippets.

@pkhuong
Last active September 5, 2023 16:42
Show Gist options
  • Save pkhuong/d9a875d63ff53a76588f70855416e767 to your computer and use it in GitHub Desktop.
Save pkhuong/d9a875d63ff53a76588f70855416e767 to your computer and use it in GitHub Desktop.
% perf stat ./tsx_per_core
Aborts: 73381
No reason: 57535
reason 0: 0
reason 1: 15833
reason 2: 15833
reason 3: 13
reason 4: 0
reason 5: 0
81597398 68933303 85476712 72654716 76819140 72119787 77728188 74167801 75244800 72119632 75951739 72606078 74035109 70998737 89185152 58165171 86098994 83338241 88201758 81982602 77964219 77861602 75642915 79207346 73514808 79929374 74429694 71508632 0 0 0 0
2147483648
2147483648
Performance counter stats for './tsx_per_core':
104039.835311 task-clock (msec) # 21.772 CPUs utilized
15,830 context-switches # 0.152 K/sec
232 cpu-migrations # 0.002 K/sec
516 page-faults # 0.005 K/sec
207,494,954,179 cycles # 1.994 GHz
0 stalled-cycles-frontend # 0.00% frontend cycles idle
0 stalled-cycles-backend # 0.00% backend cycles idle
28,541,848,264 instructions # 0.14 insns per cycle
8,698,290,000 branches # 83.605 M/sec
3,886,058 branch-misses # 0.04% of all branches
4.778610745 seconds time elapsed
-> ~96 cycles/increment on E5-2650L v4 (Broadwell). I get the same on one thread, so it looks
like we're just paying for TSX overhead, without additional cache coherency noise.
One thread, w/o rdtscp: 66 cycles (fits with ~30 cycles latency on rdtscp).
So, not exactly *fast*, but reasonably efficient (~ 1 LLC cache miss) for non-trivial work
on core-local data structures.
#include <assert.h>
#include <cpuid.h>
#include <inttypes.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>
/* Element count of a true array; not valid when given a pointer. */
#define ARRAY_SIZE(ARR) (sizeof(ARR) / sizeof(*(ARR)))

/* Number of transactional increments performed by each worker thread. */
static const size_t niter = (size_t)1 << 25;

/*
 * A single 64-bit counter, aligned (and therefore padded) to 64 bytes.
 * NOTE(review): 64 is presumably the cache line size, so that adjacent
 * slots do not share a line -- confirm for the target CPU.
 */
struct __attribute__((__aligned__(64))) counter {
	uint64_t count;
};

/* Counter slots, indexed by the core id modulo the array size. */
struct counter counters[32];

/* Abort statistics; updated with atomic adds by the threads that abort. */
static uint64_t abort_count;     /* total number of aborted transactions */
static uint64_t no_reason_count; /* aborts whose status word was 0 */
static uint64_t reason_count[6]; /* per-bit tally of abort status bits 0..5 */
/* cpu_has_rtm lifted from https://github.com/andikleen/tsx-tools/
*
* Copyright (c) 2012,2013 Intel Corporation
* Author: Andi Kleen
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that: (1) source code distributions
* retain the above copyright notice and this paragraph in its entirety, (2)
* distributions including binary code include the above copyright notice and
* this paragraph in its entirety in the documentation or other materials
* provided with the distribution
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
/* RTM feature flag: bit 11 of EBX for CPUID leaf 7, subleaf 0. */
#define CPUID_RTM (1 << 11)

/*
 * Report whether the CPU advertises RTM.
 *
 * Returns 1 when CPUID leaf 7 exists and its EBX has the RTM bit set,
 * and 0 otherwise (including when leaf 7 is not available).
 */
static inline int cpu_has_rtm(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7 must be supported before it can be queried. */
	if (__get_cpuid_max(0, NULL) < 7) {
		return 0;
	}

	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	return (ebx & CPUID_RTM) != 0;
}
/*
 * Execute RDTSCP and return the low 8 bits of the value it leaves in ECX.
 *
 * RDTSCP loads the time-stamp counter into EDX:EAX and the contents of
 * IA32_TSC_AUX into ECX.
 * NOTE(review): treating ECX as a core id assumes the OS stores the CPU
 * number in the low bits of IA32_TSC_AUX (Linux does) -- confirm on other
 * platforms. Only the low 8 bits are kept, so ids >= 256 alias.
 *
 * OUT_ticks: when non-NULL, receives the 64-bit time-stamp counter value.
 */
static inline uint8_t
get_core(uint64_t *OUT_ticks)
{
	uint32_t eax, ecx, edx;

	/*
	 * The asm must be volatile: it has no inputs, so without the
	 * qualifier the compiler may assume every execution yields the same
	 * outputs and merge repeated calls, hoist the instruction out of a
	 * loop, or drop it when the outputs look unused.
	 */
	__asm__ __volatile__("rdtscp" : "=a"(eax), "=c"(ecx), "=d"(edx));

	if (OUT_ticks != NULL) {
		*OUT_ticks = ((uint64_t)edx << 32) | eax;
	}
	return (uint8_t)ecx;
}
void
test_percore(void)
{
while (1) {
unsigned int reason;
reason = _xbegin();
if (reason == _XBEGIN_STARTED) {
uint8_t cpu;
cpu = get_core(NULL);
counters[cpu % ARRAY_SIZE(counters)].count++;
_xend();
return;
}
__sync_fetch_and_add(&abort_count, 1);
if (reason == 0) {
__sync_fetch_and_add(&no_reason_count, 1);
}
for (size_t i = 0; i < ARRAY_SIZE(reason_count); i++) {
if ((reason & (1UL << i)) != 0) {
__sync_fetch_and_add(&reason_count[i], 1);
}
}
}
return;
}
/*
 * Thread entry point: performs niter per-core transactional increments.
 *
 * arg is ignored. Always returns NULL.
 */
static void *
worker(void *arg)
{
	size_t remaining = niter;

	(void)arg;
	while (remaining-- > 0) {
		test_percore();
	}
	return NULL;
}
/*
 * Run the benchmark: start 64 worker threads, wait for them, then print
 * the abort statistics, every counter slot, the sum of all slots, and the
 * expected number of increments (thread count * niter).
 *
 * Returns 0 on success. Returns 1, after a message on stderr, when the CPU
 * does not report RTM or when a thread cannot be created or joined.
 */
int
main(void)
{
	pthread_t threads[64]; /* overcommit # threads to force context switches. */
	uint64_t total_count = 0;

	/*
	 * Checked explicitly rather than with assert(): the check must also
	 * hold in NDEBUG builds, since the workers rely on RTM instructions.
	 */
	if (!cpu_has_rtm()) {
		fprintf(stderr, "RTM is not supported on this CPU\n");
		return 1;
	}

	for (size_t i = 0; i < ARRAY_SIZE(threads); i++) {
		int rc = pthread_create(&threads[i], NULL, worker, NULL);

		if (rc != 0) {
			/* threads[i] is not valid here; joining it would be UB. */
			fprintf(stderr, "pthread_create failed for thread %zu: error %d\n",
			    i, rc);
			return 1;
		}
	}

	for (size_t i = 0; i < ARRAY_SIZE(threads); i++) {
		int rc = pthread_join(threads[i], NULL);

		if (rc != 0) {
			fprintf(stderr, "pthread_join failed for thread %zu: error %d\n",
			    i, rc);
			return 1;
		}
	}

	printf("Aborts: %" PRIu64 "\n", abort_count);
	printf("No reason: %" PRIu64 "\n", no_reason_count);
	for (size_t i = 0; i < ARRAY_SIZE(reason_count); i++) {
		printf("reason %zu: %" PRIu64 "\n", i, reason_count[i]);
	}

	for (size_t i = 0; i < ARRAY_SIZE(counters); i++) {
		printf("%" PRIu64 " ", counters[i].count);
		total_count += counters[i].count;
	}

	printf("\n%" PRIu64 "\n%zu\n", total_count, ARRAY_SIZE(threads) * niter);
	return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment