@fmerlin
Last active January 11, 2017 16:15
Memory timings with C code to test
#define buf_size (64*1024*1024)
#define step 1
/* Memory Timings
* Compiling the program below with step changed from 1 to 16 has little impact on performance,
* even though 16 times fewer elements are multiplied.
* With step=1 it takes 1.06s (3GHz processor).
* With step=16 it takes 0.93s.
* The two runs have almost the same execution time because the cache line size is 64 bytes
* (16 ints): both runs touch exactly the same set of cache lines, so the time is dominated
* by memory traffic. The remaining gap is the cost of the extra multiplications: with step=1
* each 16-int cache line gets 15 extra multiplications, i.e.
* 15/16 * 30 (passes) * 64M (ints) ~= 1.8 billion extra multiplications, and
* 1.8G / 3GHz / (4 mul per clock) = 0.15s, close to the measured 1.06s - 0.93s = 0.13s.
*/
int buf[buf_size];

int main(void) {
    for (int j = 0; j < 30; j++)
        for (int i = 0; i < buf_size; i += step)
            buf[i] *= 3;
    return 0;
}
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#define nb_sampling (100*1000*1000)
#define RFO_TIMING
#define SOCKET_REF 2
/*
The Intel architecture uses the MESI protocol for cache coherency; a cache line can be in one of 4 states:
M : Modified  => Owned and Dirty
E : Exclusive => Owned and Clean
S : Shared    => Not Owned and Clean
I : Invalid   => the line holds no usable data
A line is owned (M or E) when exactly one core's cache holds it; before ownership is granted, all other cores must invalidate their copy of the line (if they have one).
A line is dirty when its contents differ from main memory.
State changes are driven by requests sent over the interconnect (QPI, QuickPath Interconnect, on Intel).
When a core wants to read data from a cache line it sends a RD (Read Data) request; afterwards the line can be in any state except Invalid.
When a core wants to write data to a cache line it sends a RFO (Read For Ownership) request; afterwards the line must be in state M or E.
Test
Here is a code sample for RFO timing using pthread and __sync_synchronize (a full memory barrier). We let 2 threads modify the same cache line 100 million times each. Note that volatile does not imply a memory barrier; it only prevents the compiler from keeping the variable in a CPU register (which would defeat the test and hide memory ordering issues).
Timing for cache line swap on the same cpu socket
Then we compile and run:
gcc -O3 -std=c99 test3.c -o test3 -lpthread
time ./test3
We get 7s of user time for 2 x 100 million modifications, i.e. 35 ns per modification.
Timing without cache line swap
Then we change the variable alignment so that a and b no longer share a cache line: comment out the RFO_TIMING define, which selects the aligned(64) declarations. The cache coherency protocol then no longer has to bounce a line between the cores.
We get 2.32s of user time for 2 x 100 million modifications, i.e. about 11 ns per modification.
Another very important point is the cost of __sync_synchronize() itself.
If we remove the lines with __sync_synchronize():
We get 0.59s of user time for 2 x 100 million modifications, i.e. about 3 ns per modification, which is the L1 cache latency.
Without __sync_synchronize, the writes go through the memory ordering buffers (the load and store buffers), so the processor doesn't have to wait for each store to reach the L1 cache before continuing; throughput is still bounded by L1 cache access, but no fence stalls each operation.
Timing for cache line swap on different cpu sockets
Change SOCKET_REF from 2 to 1 (pick a cpu id that lives on the other physical package on your machine).
The physical id for a cpu can be found with 'less /proc/cpuinfo'.
We get 9.47s of user time for 2 x 100 million modifications, i.e. about 47 ns per modification, roughly 35% worse than the same-socket case.
*/
// comment out the RFO_TIMING define above to run the reference test (no shared cache line)
#ifdef RFO_TIMING
volatile int a;
volatile int b;
#else
volatile int a __attribute__((aligned(64)));
volatile int b __attribute__((aligned(64)));
#endif
void* loop_a(void* arg)
{
    for (int i = 0; i < nb_sampling; i++) {
        a++;
        __sync_synchronize();
    }
    return NULL;
}
void* loop_b(void* arg)
{
    for (int i = 0; i < nb_sampling; i++) {
        b++;
        __sync_synchronize();
    }
    return NULL;
}
int main(void) {
    pthread_t thrd;
    cpu_set_t mask;
    pthread_attr_t attr;
    CPU_ZERO(&mask);
    CPU_SET(0, &mask);
    pthread_attr_init(&attr);
    pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &mask);
    pthread_create(&thrd, &attr, loop_a, NULL);
    pthread_t thrd2;
    cpu_set_t mask2;
    pthread_attr_t attr2;
    CPU_ZERO(&mask2);
    CPU_SET(SOCKET_REF, &mask2);
    pthread_attr_init(&attr2);
    pthread_attr_setaffinity_np(&attr2, sizeof(cpu_set_t), &mask2);
    pthread_create(&thrd2, &attr2, loop_b, NULL);
    pthread_join(thrd, NULL);
    pthread_join(thrd2, NULL);
    return 0;
}
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <vector>
/*
A lock, and more precisely a spinlock, has two states: locked and unlocked. The code for a spinlock is generally very short (pseudo-assembly, Intel syntax):
lock_addr:
    dd 0
spin_lock:
    mov eax, 0                     ; expected value: 0 (unlocked)
    mov ecx, 1                     ; desired value: 1 (locked)
    lock cmpxchg [lock_addr], ecx  ; if [lock_addr] == eax, store ecx into [lock_addr]; sets ZF on success
    jnz spin_lock                  ; retry if the compare failed (jump if not zero, i.e. the lock was taken)
    ret
spin_unlock:
    mov dword [lock_addr], 0       ; a plain aligned store is atomic on x86; the lock prefix is not needed (nor valid) on mov
    ret
The lock prefix makes sure that the cmpxchg operation is atomic and acts as a full memory barrier (the store buffer is drained before it executes).
The spinlock has one major drawback: it does not put the current thread to sleep if the lock cannot be acquired. The code loops (hence "spin" lock) until the lock has been acquired, burning CPU the whole time. The only reason to use it is when the lock is held for a very short period of time, or in kernel mode.
The mutex behaves like an enhanced spinlock that puts the current thread to sleep if the lock is not acquired; on Linux the uncontended path stays in user space, and the kernel (futex) is only involved when a thread actually has to sleep.
The pthread library has methods for spinlocks and mutexes.
pthread_spin_init() / pthread_mutex_init()
pthread_spin_lock() / pthread_mutex_lock()
pthread_spin_unlock() / pthread_mutex_unlock()
pthread_spin_destroy() / pthread_mutex_destroy()
Test
The goal is to write the same test for a spinlock and a mutex. The example fills a vector with up to 20 million entries (2 threads x 10 million iterations); each insertion is guarded by a lock, and a dummy check is computed before it to simulate some work.
In order to test the mutex version, replace the _spin_ types/functions with their _mutex_ counterparts.
The timings give
time ./lock => 2.95s user time for the spinlock version which means 147ns per loop iteration
time ./lock2 => 3.80s user time for the mutex version which means 190ns per loop iteration
If we remove all the lock operations (and put the vector variable on the stack)
The timings give
time ./lock3 => 1.44s user time for the no lock version which means 72 ns per loop iteration
*/
pthread_spinlock_t lock;
pthread_t thr1, thr2;
cpu_set_t mask1,mask2;
pthread_attr_t attr1,attr2;
std::vector<int> lst;
int check(int i) {
    int res = 0;
    for (int j = 1; j < 50; j++)
        res += res*i + j;
    return res % 2;
}
void *consumer(void *ptr)
{
    for (int i = 0; i < 10*1000*1000; i++)
        if (check(i)) {
            pthread_spin_lock(&lock);
            lst.push_back(i);
            pthread_spin_unlock(&lock);
        }
    return NULL;
}
int main()
{
    pthread_spin_init(&lock, 0);
    CPU_ZERO(&mask1);
    CPU_ZERO(&mask2);
    CPU_SET(0, &mask1);
    CPU_SET(2, &mask2);
    pthread_attr_init(&attr1);
    pthread_attr_init(&attr2);
    pthread_attr_setaffinity_np(&attr1, sizeof(cpu_set_t), &mask1);
    pthread_attr_setaffinity_np(&attr2, sizeof(cpu_set_t), &mask2);
    pthread_create(&thr1, &attr1, consumer, NULL);
    pthread_create(&thr2, &attr2, consumer, NULL);
    pthread_join(thr1, NULL);
    pthread_join(thr2, NULL);
    pthread_spin_destroy(&lock);
    return 0;
}