Skip to content

Instantly share code, notes, and snippets.

@pkhuong
Last active October 18, 2022 09:01
Show Gist options
  • Save pkhuong/1ce34e33c6df4b9be3bc9beb22415a47 to your computer and use it in GitHub Desktop.
Save pkhuong/1ce34e33c6df4b9be3bc9beb22415a47 to your computer and use it in GitHub Desktop.
minimal BTS tracing wrapper for linux perf
#define RUN_ME /*
exec cc -O2 -W -Wall -std=c99 -shared $0 -o "$(basename $0 .c).so" -fPIC
*/
/*
* Copyright 2019 Paul Khuong
* SPDX-License-Identifier: BSD-2-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define _GNU_SOURCE
#include <errno.h>
#include <linux/perf_event.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
/*
* Public interface.
*/
/*
 * One branch trace record, as handed back by `bts_stop`.
 *
 * `bts_stop` copies records out of the aux buffer in chunks of
 * `3 * sizeof(uint64_t)` bytes, so this struct must stay exactly
 * three 64-bit words with no padding.
 */
struct bts_aux_record {
uint64_t from; /* Instruction address (source side of the record). */
uint64_t to; /* Instruction address (destination side of the record). */
uint64_t flags; /* 0x10 = predicted, in theory, maybe. */
};
/*
 * Records the perf event type to use for BTS tracing.
 *
 * This function must be called with the value in
 * `/sys/bus/event_source/devices/intel_bts/type` before calling
 * `bts_setup`.
 *
 * The value is stored in a single file-scope variable that is shared
 * by all threads (it is not thread-local), and no synchronisation is
 * performed around it.
 */
void bts_init(int detected_bts_perf_type);
/*
 * Cleans up any BTS state for the current thread: frees the
 * linearised result buffer, unmaps the aux buffer and the perf
 * control page, and closes the perf fd.
 *
 * It is safe to call this when the thread has no BTS state, and to
 * call it more than once.  Any buffer previously returned by
 * `bts_stop` on this thread is invalid afterwards.
 */
void bts_teardown(void);
/*
 * Overwrites or creates the BTS state for the current thread, with
 * an auxiliary (tracing) buffer of `aux_size` bytes.  `aux_size`
 * must be a power of two and must be at least one page.
 *
 * Any state the thread already had is torn down first.  On failure
 * the thread is left without any BTS state, a diagnostic is printed
 * to stderr, and `errno` is restored after the cleanup.
 *
 * Returns 0 on success, negative on failure.
 */
int bts_setup(size_t aux_size);
/*
 * Enables branch tracing for the calling thread, which must have
 * a BTS state (i.e., only call `bts_start` after `bts_setup`).
 *
 * Any aux data still pending from an earlier run is discarded
 * (marked as consumed) before tracing is switched on.
 *
 * Returns 0 on success, negative on failure.
 */
int bts_start(void);
/*
 * Stops branch tracing for the current thread, and returns a
 * temporary (thread-local) buffer of the BTS records since
 * the last call to `bts_start`.
 *
 * The first argument is overwritten with the number of valid
 * records in the return value, or a negative count on error.
 * The return value is NULL on error, and also when no record was
 * produced (in which case `*OUT_num_elements` is 0).
 *
 * The returned buffer belongs to the thread's BTS state: it is
 * reused by the next `bts_stop` and released by `bts_teardown`
 * (and therefore by `bts_setup`).  Callers must not free it.
 *
 * When `(*OUT_num_elements + 2) * sizeof(struct bts_aux_record)`
 * exceeds the `aux_size` passed to `bts_setup`, tracing may have
 * exhausted the buffer space and stopped early.  This trace
 * truncation does not affect the execution of the traced program.
 */
const struct bts_aux_record *bts_stop(ssize_t *OUT_num_elements);
/*
* Actual implementation follows.
*/
/*
 * Thread-local BTS state.  `bts_setup` fills every field in, and
 * `bts_teardown` releases the resources and resets the fields.
 */
struct bts_recording_state {
/* perf event fd for this thread; negative when no event is open. */
int bts_fd;
/* Mapping of the perf control page for `bts_fd`; NULL when unmapped. */
struct perf_event_mmap_page *perf;
/* Mapping of the aux ring buffer, `perf->aux_size` bytes long; NULL when unmapped. */
void *aux_buffer;
/*
 * This buffer has the same size as aux_buffer, and holds the
 * aux records in linear order.  It is heap-allocated by `bts_setup`
 * and is what `bts_stop` returns to the caller.
 */
struct bts_aux_record *linear_result;
};
/* Assume 4 KiB pages unless the build environment already defines PAGE_SIZE. */
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096ULL
#endif
/*
 * Size of the perf mapping: only the control page is mapped, with
 * no data ring buffer page behind it.
 */
static const size_t perf_page_size = PAGE_SIZE;
/* perf event type used by `bts_open_fd`; set by `bts_init`, -1 until then. */
static int bts_perf_type = -1;
/*
 * Per-thread recording state.  Only `bts_fd` needs an explicit
 * initialiser: the pointer fields start out NULL, and -1 marks
 * "no perf fd open".
 */
static __thread struct bts_recording_state thread_state = {
.bts_fd = -1,
};
/*
 * Returns the result of the raw `gettid` system call for the calling
 * thread, narrowed to `pid_t`.
 */
static pid_t bts_gettid(void)
{
	const long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
/*
 * Thin wrapper around the raw `perf_event_open` system call.
 *
 * Every integer argument is widened to `uintptr_t` before being
 * passed through the variadic `syscall` entry point.  The return
 * value is whatever `syscall` returns.
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
			    int group_fd, unsigned long flags)
{
	const uintptr_t pid_arg = (uintptr_t)pid;
	const uintptr_t cpu_arg = (uintptr_t)cpu;
	const uintptr_t group_arg = (uintptr_t)group_fd;
	const uintptr_t flags_arg = (uintptr_t)flags;

	return syscall(__NR_perf_event_open, hw_event, pid_arg, cpu_arg,
		       group_arg, flags_arg);
}
/*
 * Opens a BTS perf fd for `tid`.
 *
 * The event uses the type registered through `bts_init`, starts out
 * disabled, and excludes kernel and hypervisor activity.  It is not
 * bound to a CPU or to an event group, and the fd is opened
 * close-on-exec.
 *
 * Returns the result of `perf_event_open`, narrowed to `int`.
 */
static int bts_open_fd(pid_t tid)
{
	struct perf_event_attr attr = { 0 };

	attr.type = bts_perf_type;
	attr.size = sizeof(attr);
	attr.disabled = 1;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	const int any_cpu = -1;
	const int no_group = -1;

	return perf_event_open(&attr, tid, any_cpu, no_group,
			       PERF_FLAG_FD_CLOEXEC);
}
/*
 * Remembers `detected_bts_perf_type` as the perf event type that
 * later `bts_setup` calls will open.  Nothing is validated here.
 */
void bts_init(int detected_bts_perf_type)
{
	bts_perf_type = detected_bts_perf_type;
}
void bts_teardown(void)
{
free(thread_state.linear_result);
thread_state.linear_result = NULL;
if (thread_state.aux_buffer != NULL) {
munmap(thread_state.aux_buffer, thread_state.perf->aux_size);
}
thread_state.aux_buffer = NULL;
if (thread_state.perf != NULL) {
munmap(thread_state.perf, perf_page_size);
}
thread_state.perf = NULL;
if (thread_state.bts_fd >= 0) {
close(thread_state.bts_fd);
}
thread_state.bts_fd = -1;
return;
}
/*
 * Creates (or replaces) the calling thread's BTS state with an aux
 * buffer of `aux_size` bytes.
 *
 * `aux_size` must be a power of two and at least one page; anything
 * else is rejected with `errno == EINVAL` before any resource is
 * acquired.  `bts_stop` relies on the power-of-two property for its
 * index arithmetic.
 *
 * Steps: tear down old state, open the perf fd, map the control
 * page, publish the aux geometry in it, map the aux buffer, and
 * allocate the linearised result buffer.
 *
 * Returns 0 on success.  On failure, prints a diagnostic to stderr,
 * tears down any partially built state, restores `errno` to its
 * value at the point of failure, and returns a negative value.
 */
int bts_setup(size_t aux_size)
{
	int r;

	/* Clean up any current state. */
	bts_teardown();

	/* Enforce the documented contract before touching the kernel. */
	if (aux_size < perf_page_size || (aux_size & (aux_size - 1)) != 0) {
		errno = EINVAL;
		perror("bts_setup: aux_size");
		r = -1;
		goto fail;
	}

	/* Get a BTS perf fd for this thread. */
	{
		int fd = bts_open_fd(bts_gettid());

		if (fd < 0) {
			perror("perf_open_fd");
			r = fd;
			goto fail;
		}

		thread_state.bts_fd = fd;
	}

	/* Map in the perf mmap control block. */
	{
		void *mapped = mmap(NULL, perf_page_size, PROT_READ | PROT_WRITE,
				    MAP_SHARED, thread_state.bts_fd, 0);

		if (mapped == MAP_FAILED) {
			perror("perf");
			r = -1;
			goto fail;
		}

		thread_state.perf = mapped;

		/* Populate the aux metadata fields so we can mmap the aux buffer in. */
		thread_state.perf->aux_offset =
		    thread_state.perf->data_offset + thread_state.perf->data_size;
		thread_state.perf->aux_size = aux_size;

		/* Leave the control page read-only; bts_start re-enables writes briefly. */
		r = mprotect(mapped, perf_page_size, PROT_READ);
		if (r < 0) {
			perror("mprotect READ");
			goto fail;
		}
	}

	/* Map in the auxiliary data ring buffer. */
	{
		void *mapped = mmap(NULL, thread_state.perf->aux_size,
				    PROT_READ | PROT_WRITE, MAP_SHARED,
				    thread_state.bts_fd,
				    thread_state.perf->aux_offset);

		if (mapped == MAP_FAILED) {
			perror("aux_buffer");
			r = -1;
			goto fail;
		}

		thread_state.aux_buffer = mapped;
	}

	/* Allocate a buffer large enough to hold a linearised copy of the aux data. */
	thread_state.linear_result = malloc(thread_state.perf->aux_size);
	if (thread_state.linear_result == NULL) {
		perror("malloc");
		r = -1;
		goto fail;
	}

	return 0;

fail:
	{
		/* The cleanup calls may clobber errno; keep the original cause. */
		int err = errno;

		bts_teardown();
		errno = err;
	}

	return r;
}
/*
 * Starts (or restarts) branch tracing on the calling thread.
 *
 * Sequence, in this exact order:
 *   1. disable the event, so the ring buffer is quiescent;
 *   2. make the control page writable;
 *   3. mark all pending aux data as consumed (aux_tail = aux_head);
 *   4. make the control page read-only again;
 *   5. reset the event;
 *   6. enable the event.
 *
 * Steps 1-5 print a `perror` diagnostic and return the failing
 * call's result.  The result of step 6 is returned as-is, without a
 * diagnostic.
 */
int bts_start(void)
{
	const int fd = thread_state.bts_fd;
	struct perf_event_mmap_page *const page = thread_state.perf;
	int rc;

	/* Never touch the ring buffer while tracing is live. */
	rc = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (rc < 0) {
		perror("perf disable");
		return rc;
	}

	/*
	 * The control page normally stays read-only, which lets the
	 * kernel silently overwrite entries in the zero-sized data
	 * ring buffer.  Advancing the aux read pointer needs write
	 * access, so grant it just for the store below.
	 */
	rc = mprotect(page, perf_page_size, PROT_READ | PROT_WRITE);
	if (rc < 0) {
		perror("mprotect READ | WRITE");
		return rc;
	}

	/* Drop everything the previous run left in the aux buffer. */
	page->aux_tail = page->aux_head;

	/*
	 * Probably unnecessary (no data record is expected), but going
	 * back to read-only keeps the data ring buffer in silent
	 * overwrite mode.
	 */
	rc = mprotect(page, perf_page_size, PROT_READ);
	if (rc < 0) {
		perror("mprotect READ");
		return rc;
	}

	/* This also seems redundant, but honggfuzz does it... */
	rc = ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	if (rc < 0) {
		perror("perf reset");
		return rc;
	}

	/* Switch tracing on last, to keep setup noise out of the trace. */
	rc = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	return rc;
}
const struct bts_aux_record *bts_stop(ssize_t *OUT_num_elements)
{
const size_t element_size = sizeof(uint64_t) * 3;
int r;
/* Stop BTS tracing ASAP to minimise noise. */
r = ioctl(thread_state.bts_fd, PERF_EVENT_IOC_DISABLE, 0);
*OUT_num_elements = 0;
if (r < 0) {
perror("perf disable");
*OUT_num_elements = r;
return NULL;
}
const uint64_t aux_head = thread_state.perf->aux_head;
const uint64_t aux_tail = thread_state.perf->aux_tail;
const uint64_t aux_size = thread_state.perf->aux_size;
/*
* aux_size is a power of two, so
* (x & aux_mask) == (x % aux_size).
*/
const uint64_t aux_mask = aux_size - 1;
if (aux_head - aux_tail > aux_size) {
fprintf(stderr,
"Auxiliary data overflow despite non-overwrite mode (!?)\n");
*OUT_num_elements = -1;
return NULL;
}
if (aux_head == aux_tail) {
*OUT_num_elements = 0;
return NULL;
}
/*
* Copy complete BTS records from the ring buffer in order.
*/
const char *src = thread_state.aux_buffer;
struct bts_aux_record *dst = thread_state.linear_result;
uint64_t offset = aux_tail;
while (offset < aux_head) {
/*
* Convert the logical monotonic offset to a byte
* index by taking mod aux_size.
*/
size_t index = offset & aux_mask;
/* If there's room for a record, copy it */
if (index <= aux_size - element_size) {
memcpy(dst++, src + index, element_size);
offset += element_size;
} else {
/* ... otherwise, skip over the padding. */
offset += aux_size - index;
}
}
/* Consumption should match the production logic exactly. */
if (offset != aux_head) {
fprintf(stderr, "Unexpected aux packet size.\n");
*OUT_num_elements = -1;
return NULL;
}
*OUT_num_elements = dst - thread_state.linear_result;
return thread_state.linear_result;
}
#ifdef TEST
#include <assert.h>
#include <inttypes.h>
/*
 * Smoke test for the BTS wrapper (built only with -DTEST).
 *
 * Usage: prog [bts_perf_type]
 *
 * Traces ten runs of a 1000-iteration empty loop, printing one
 * record from near each end of every trace.  It then traces a
 * 10-iteration loop and prints the last record.
 *
 * Returns 0 on success, EXIT_FAILURE if a trace is too short to
 * probe; setup and start failures trip an assert.
 */
int main(int argc, char **argv)
{
	if (argc > 1) {
		bts_init(atoi(argv[1]));
	} else {
		/*
		 * The value in
		 * `/sys/bus/event_source/devices/intel_bts/type` on
		 * my machine.
		 */
		bts_init(8);
	}

	int r = bts_setup(256 * PAGE_SIZE);
	assert(r == 0);

	ssize_t num_entries;
	const struct bts_aux_record *entries;

	for (size_t j = 0; j < 10; j++) {
		r = bts_start();
		assert(r == 0);

		/*
		 * `__asm__` rather than `asm`: the latter is not a
		 * keyword in strict ISO modes such as -std=c99.
		 */
		for (size_t i = 0; i < 1000; i++) {
			__asm__ volatile("" : "+r"(i));
		}

		entries = bts_stop(&num_entries);
		printf("num_entries: %zd\n", num_entries);
		assert(entries != NULL);

		/* The probes below read index 10 and index num_entries - 10. */
		if (num_entries <= 10) {
			fprintf(stderr, "trace too short to probe\n");
			bts_teardown();
			return EXIT_FAILURE;
		}

		printf("%"PRIx64" %"PRIx64" %"PRIx64"\n",
		       entries[10].from,
		       entries[10].to,
		       entries[10].flags);
		printf("%"PRIx64" %"PRIx64" %"PRIx64"\n",
		       entries[num_entries - 10].from,
		       entries[num_entries - 10].to,
		       entries[num_entries - 10].flags);
	}

	{
		r = bts_start();
		assert(r == 0);

		for (size_t i = 0; i < 10; i++) {
			__asm__ volatile("" : "+r"(i));
		}

		entries = bts_stop(&num_entries);
		printf("num_entries: %zd\n", num_entries);

		/* bts_stop returns NULL on error and for an empty trace. */
		if (entries == NULL || num_entries < 1) {
			fprintf(stderr, "no trace records to print\n");
			bts_teardown();
			return EXIT_FAILURE;
		}

		printf("%"PRIx64" %"PRIx64" %"PRIx64"\n",
		       entries[num_entries - 1].from,
		       entries[num_entries - 1].to,
		       entries[num_entries - 1].flags);
	}

	bts_teardown();
	return 0;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment