|
/** |
|
* This video https://www.youtube.com/watch?v=WDIkqP4JbkE inspired me to write this code. Just to play with my newly acquired knowledge :)
|
* Special Thanks to Scott Meyers!!! |
|
* |
|
* Some resources too: |
|
* https://stackoverflow.com/questions/5248915/execution-time-of-c-program |
|
* https://stackoverflow.com/questions/11624545/how-to-make-main-thread-wait-for-all-child-threads-finish |
|
* |
|
* To compile, run: clang -g -Wall -pedantic -pthread -o False-Shared-Cache.out False-Shared-Cache.c
|
|
|
*/ |
|
|
|
#include <time.h> // clock |
|
#include <stdio.h> // printf |
|
#include <pthread.h> // threads stuff |
|
#include <sys/sysinfo.h> // get_nprocs_conf |
|
|
|
/* Expands to a call to get_nprocs_conf(), so every use performs that call. */
#define PROCESSORS_AVAILABLE (get_nprocs_conf())

/* Number of times each thread updates its slot. */
#define ITERATIONS 100000000

/* Forward declarations for the functions defined below main(). */

/* Thread start routine passed to pthread_create(). */
void *action(void *ptr);

/* Spawns the worker threads, waits for them and prints the timing. */
void shared_cache(void);

/* Arithmetic busy-work applied to a thread's value on every iteration. */
unsigned long intesive_task(unsigned long value);
|
|
|
int main( |
|
int argc, |
|
char **argv |
|
) { |
|
|
|
shared_cache(); |
|
printf("\nProcessors number: %d.\n", PROCESSORS_AVAILABLE); |
|
|
|
return 0; |
|
} |
|
|
|
// This is a slow action, because we share the cache line among threads |
|
// and, when a thread writes to the cache line, its invalidates the cache from other threads |
|
// forcing them to go to the main memory to retrieve the line again |
|
void * action(void *ptr) { |
|
// Converts to a long pointer |
|
unsigned long *int_ptr = (unsigned long*)ptr; |
|
|
|
|
|
// dereferences the integer pointer and assign a new value |
|
for (int index = 0; index < ITERATIONS; index++) { |
|
// writes to the line that are in the cache |
|
// it will invalidate others threads cache since we are sharing the cache line with other threads |
|
// and, forces them to retrive fresh information from main memory |
|
*int_ptr = intesive_task(*int_ptr) + *int_ptr; |
|
} |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
|
|
/**
 * Tries to keep the CPU busy with a chain of multiplications, divisions and
 * one modulo on `value`.
 *
 * `*`, `/` and `%` share one precedence level and associate left to right,
 * so the steps below are applied strictly in order. The arithmetic is
 * unsigned, so intermediate overflow wraps around.
 *
 * @param value input operand
 * @return result of the operation chain
 */
unsigned long intesive_task(unsigned long value) {
    unsigned long acc = value * value;

    acc /= 2;
    acc *= value;
    acc /= 3;
    acc *= value;
    acc /= 5;
    acc *= value;
    acc %= 3;
    acc *= value;
    acc *= value;
    acc /= 2;

    return acc;
}
|
|
|
|
|
void shared_cache(void) { |
|
|
|
// Creates threads for all processor available to this program |
|
pthread_t threads[PROCESSORS_AVAILABLE]; |
|
|
|
// since, it is an array, will have multiple elements per cache line |
|
unsigned long result[PROCESSORS_AVAILABLE]; |
|
|
|
// My processor has D1-D cache line of 64 bytes. |
|
// unsigned long -> 8 bytes |
|
// | | | | | | | | | | | | | | | | | | | | | | |
|
// | thread 1 | thread 2 | thread 3 | |
|
// When reading from main memory, the processor does not read byte per byte, instead, |
|
// it reads bytes of the size of a cache line |
|
// so, it will read 64 bytes and, prefetch(it tries to guess that I will need |
|
// the next 64 later, this is why arrays are so good |
|
// the next 64 and so on ... |
|
|
|
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) { |
|
result[index] = index; |
|
} |
|
|
|
const clock_t tic = clock(); |
|
|
|
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) { |
|
|
|
/** |
|
* Will cause false sharing, since we share the result variable with all threads |
|
* when one thread write its calculation to the address, the value that other thread is using |
|
* will be on the same cache line, causing the processor to invalidate their cache line, and then, it will need |
|
* to retrieve again from the main memory which is too slow comparing to processor cache |
|
**/ |
|
pthread_create(&threads[index], NULL, action, &(result[index])); |
|
} |
|
|
|
// Waits for threads to finish |
|
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) { |
|
pthread_join(threads[index], NULL); |
|
} |
|
|
|
const clock_t toc = clock(); |
|
for (int index = 0; index < PROCESSORS_AVAILABLE; index++) { |
|
printf("Thread: %d result: %ld.\n", index, result[index]); |
|
} |
|
|
|
printf("Shared Cache False Sharing Time spent: %f seconds ...\n", (double)(toc - tic) / CLOCKS_PER_SEC); |
|
} |