Skip to content

Instantly share code, notes, and snippets.

@s-macke
Last active May 9, 2022 19:40
Show Gist options
  • Save s-macke/08c8c603ec7c034eb4af49b825c49861 to your computer and use it in GitHub Desktop.
Save s-macke/08c8c603ec7c034eb4af49b825c49861 to your computer and use it in GitHub Desktop.
The code measures the direct and indirect cost (L2 cache misses) of context switches
// compile with
// 'gcc -O2 main.c -o contextperf -lpthread'
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/errno.h>
#include <time.h>
// Number of bytes of RAM each thread allocates and works on in calculate().
// Defaults to 12 MB; main() overrides it when the program is started with one
// argument, which is interpreted as a size in MB.
int64_t SIZE = 12 * 1024 * 1024;
// ---------------------------------------------------------------
// Read the CPU clock counter with the x86 `rdtsc` instruction.
// The instruction's two 32-bit outputs (register constraints "a" and "d") are
// merged into a single 64-bit tick count, which is returned as int64_t.
int64_t rdtsc() {
    uint32_t low_half;
    uint32_t high_half;
    __asm__ __volatile__ ("rdtsc" : "=a" (low_half), "=d" (high_half));

    uint64_t ticks = high_half;
    ticks <<= 32;
    ticks |= low_half;
    return (int64_t) ticks;
}
// Get the current calendar time (TIME_UTC) in nanoseconds.
// Returns 0 if timespec_get() reports a failure.
int64_t getTime() {
    struct timespec ts = {0};
    // timespec_get() returns the requested time base on success and 0 on failure.
    if (timespec_get(&ts, TIME_UTC) != TIME_UTC) {
        return 0;
    }
    // Widen the seconds to 64 bits before multiplying, so the product cannot
    // overflow on platforms where time_t is narrower than 64 bits.
    return (int64_t) ts.tv_sec * INT64_C(1000000000) + (int64_t) ts.tv_nsec;
}
// ---------------------------------------------------------------
// Restrict the calling thread to run on the single CPU core `core_id`.
// Returns EINVAL when `core_id` is negative or not below the number of online
// processors reported by sysconf(); otherwise returns the result of
// pthread_setaffinity_np().
int stick_this_thread_to_core(int core_id) {
    int online_cores = sysconf(_SC_NPROCESSORS_ONLN);
    if (!(core_id >= 0 && core_id < online_cores)) {
        return EINVAL;
    }

    // Affinity mask containing exactly the requested core.
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(core_id, &mask);

    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask);
}
// ---------------------------------------------------------------
// Some arbitrary calculation that reads and writes SIZE bytes of RAM:
// every byte of `data` is multiplied by the byte mirrored at the other end of
// the buffer.
// `data` must point to at least SIZE bytes.
// Returns the number of clock cycles (rdtsc ticks) the loop needed.
int64_t __attribute__ ((noinline)) calculate(char *data) {
    int64_t clock = rdtsc();
    // The index is 64 bits wide like SIZE; an `int` index would overflow for
    // buffers larger than INT_MAX bytes.
    for (int64_t i = 0; i < SIZE; i++) {
        data[i] *= data[SIZE - i - 1];
    }
    return rdtsc() - clock;
}
// ---------------------------------------------------------------
// Measurement state shared by getFastestRun() and both measurement threads.
// The accumulators (switch_direct, switch0, switch1, n) are summed up in
// thread() and divided by n when a result line is printed.
struct {
int64_t shortest_time; // shortest number of clock cycles the calculation took (result of fastest()); baseline for the other calculations
int64_t switch_direct; // accumulated clock cycles between storing cycles_before and the next thread resuming
int64_t switch0; // accumulated clock cycles the calculation took on the first run after a thread switch
int64_t switch1; // accumulated clock cycles assumed for a second run; thread() adds "shortest_time" for every measurement
int64_t n; // number of measurements accumulated so far
double cpu_frequency; // CPU frequency in GHz (rdtsc ticks per nanosecond), set by getFastestRun()
} statistics;
// ---------------------------------------------------------------
// Clock cycle (rdtsc value) stored just before a thread switch. The thread
// that runs next subtracts it from its own rdtsc() reading.
volatile int64_t cycles_before = 0; // the clock cycle before the thread switch
// ---------------------------------------------------------------
// mutex and condition to control the thread switching
// ready_mutex is the mutex passed to pthread_cond_wait() in thread().
pthread_mutex_t ready_mutex = PTHREAD_MUTEX_INITIALIZER;
// One condition variable per thread, indexed by thread id (0 or 1).
pthread_cond_t ready_cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
// ready_flag[id] != 0 means thread `id` may run its next measurement; the
// thread clears its own flag and sets the other thread's flag when it is done.
int ready_flag[2] = {0, 0};
// ---------------------------------------------------------------
// Body of both measurement threads.
//
// `vargp` points to an int holding the thread id (0 or 1). Each thread
// allocates its own SIZE-byte buffer, pins itself to core 0 and then takes
// turns with the other thread: it waits for its ready flag, measures how long
// the hand-over took and how long calculate() takes right after it, prints the
// running averages and hands over to the other thread.
// Always returns NULL. Terminates the program if the buffer cannot be allocated.
void *thread(void *vargp) {
    enum {
        MEASUREMENTS = 100,     // number of measurement rounds per thread
        LAST_WARMUP_ROUND = 10  // the statistics are discarded after this round
    };

    int threadid = *(int *) vargp;

    char *data = malloc(SIZE);
    if (data == NULL) {
        printf("malloc failed\n");
        exit(1);
    }

    // The accumulators in `statistics` are deliberately not reset here: they
    // start out zero-initialized (static storage), and resetting them from
    // both threads without holding ready_mutex would be a data race.

    // warmup 1, allow the kernel to assign the RAM
    calculate(data);

    // Both threads are pinned to core 0, so they cannot run at the same time
    // and every hand-over involves a switch on that core.
    int pin_result = stick_this_thread_to_core(0);
    if (pin_result != 0) {
        fprintf(stderr, "warning: thread %i could not be pinned to core 0 (error %i)\n", threadid, pin_result);
    }

    // warmup 2. calculate on core 0
    calculate(data);
    printf("thread %i ready\n", threadid);

    for (int j = 0; j < MEASUREMENTS; j++) {
        pthread_mutex_lock(&ready_mutex);
        while (!ready_flag[threadid]) {
            pthread_cond_wait(&ready_cond[threadid], &ready_mutex);
        }
        ready_flag[threadid] = 0;

        // the clock cycles it took to switch from the other thread to this thread
        statistics.switch_direct += (rdtsc() - cycles_before);
        statistics.switch0 += calculate(data);
        statistics.switch1 += statistics.shortest_time;
        statistics.n++;

        // tell the other thread to run. Because of the mutex, the other thread
        // will not run before this thread is done.
        ready_flag[1 - threadid] = 1;
        pthread_cond_signal(&ready_cond[1 - threadid]);

        if (statistics.n != 0) {
            // average calculation time after a switch minus the baseline
            int64_t lost = statistics.switch0 / statistics.n - statistics.shortest_time;
            printf("%3i: thread=%i, direct_clock= %lli cycles, indirect_clock= %lli cycles cpu_frequency= %lf GHz\n",
                   j, threadid,
                   (long long) (statistics.switch_direct / statistics.n),
                   (long long) lost,
                   statistics.cpu_frequency);
        }

        // warmup 3, don't trust the first measurements
        if (j == LAST_WARMUP_ROUND) {
            statistics.switch0 = 0;
            statistics.switch1 = 0;
            statistics.switch_direct = 0;
            statistics.n = 0;
        }

        cycles_before = rdtsc(); // store the clock cycle before the thread switch
        pthread_mutex_unlock(&ready_mutex); // switch to the other thread
    }

    // end of the measurements
    free(data);
    return NULL;
}
// ---------------------------------------------------------------
// Run calculate() 30 times on a freshly allocated SIZE-byte buffer and return
// the smallest number of clock cycles any single run needed.
// Prints "malloc failed" and exits with status 1 if the buffer cannot be allocated.
int64_t fastest() {
    char *buffer = malloc(SIZE);
    if (!buffer) {
        printf("malloc failed\n");
        exit(1);
    }

    int64_t best = INT64_MAX;
    int remaining = 30;
    while (remaining-- > 0) {
        int64_t cycles = calculate(buffer);
        best = (cycles < best) ? cycles : best;
    }

    free(buffer);
    return best;
}
// Determine the baseline calculation time and the CPU frequency and store
// them in statistics.shortest_time and statistics.cpu_frequency.
// The frequency is the number of rdtsc ticks per nanosecond (GHz), measured
// over the time the baseline runs take. Both results are printed.
void getFastestRun() {
    int64_t starttime = getTime();
    int64_t startclock = rdtsc();

    stick_this_thread_to_core(0);

    // fastest() is run twice and only the second result is kept; the first
    // pass presumably serves as a warmup (NOTE(review): confirm intent).
    (void) fastest();
    statistics.shortest_time = fastest();
    printf("shortest=%lli\n", (long long) statistics.shortest_time);

    // Read both end stamps in separate statements; inside a single expression
    // the order of the two calls would be unspecified.
    int64_t endclock = rdtsc();
    int64_t endtime = getTime();
    statistics.cpu_frequency = (double) (endclock - startclock) / (double) (endtime - starttime);
    printf("%lf GHz\n", statistics.cpu_frequency);
}
// ---------------------------------------------------------------
int main(int argc, char *argv[]) {
// allow to change the amount of RAM used by the threads
if (argc == 2) {
SIZE = atoi(argv[1]) * 1024 * 1024;
if (SIZE == 0) {
printf("Error: argument must be a integer larger than 0\n");
return 1;
}
}
printf("RAM usage per thread: %liMB\n", SIZE / 1024 / 1024);
getFastestRun(); // determine the CPU frequency and the shortest number of clock cycles for the calculation.
// create the threads
pthread_t thread_id1;
int id0 = 0;
pthread_create(&thread_id1, NULL, thread, &id0);
pthread_t thread_id2;
int id1 = 1;
pthread_create(&thread_id2, NULL, thread, &id1);
// send thread 0 the signat to start
cycles_before = rdtsc();
ready_flag[0] = 1;
pthread_cond_signal(&ready_cond[0]);
// wait for the threads to finish
pthread_join(thread_id2, NULL);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment