@sgtcortez
Last active March 16, 2022 20:18
This gist demonstrates false sharing of cache lines. It shows how the same naive task can perform very differently depending on whether the cache line is shared among threads in an effective way.

Summary

Introduction

This gist is inspired by Scott Meyers' talk about processor caches and why you should care about them.

Shared-Cache

Contains the source code in which the cache line is shared among threads, but each thread accumulates into a local variable and writes to the shared line only once.

To compile, run:

clang -g -Wall -pedantic -lpthread -o Shared-Cache.out Shared-Cache.c

False Shared-Cache

Contains the source code in which the cache line is falsely shared among threads: every thread writes to it on every iteration.

To compile, run:

clang -g -Wall -pedantic -lpthread -o False-Shared-Cache.out False-Shared-Cache.c
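Once both programs are compiled, a quick way to see the effect is to run the two binaries back to back and compare the reported times (the output names below are simply the ones used in the compile commands above); on a multi-core machine the false-sharing version is typically noticeably slower:

./Shared-Cache.out
./False-Shared-Cache.out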
/**
* This video https://www.youtube.com/watch?v=WDIkqP4JbkE inspired me to write this code. Just to play with my new knowledge :)
* Special Thanks to Scott Meyers!!!
*
* Some resources too:
* https://stackoverflow.com/questions/5248915/execution-time-of-c-program
* https://stackoverflow.com/questions/11624545/how-to-make-main-thread-wait-for-all-child-threads-finish
*
* To compile, run: clang -g -Wall -pedantic -lpthread -o False-Shared-Cache.out False-Shared-Cache.c
*/
#include <time.h> // clock
#include <stdio.h> // printf
#include <pthread.h> // threads stuff
#include <sys/sysinfo.h> // get_nprocs_conf
#define PROCESSORS_AVAILABLE get_nprocs_conf()
#define ITERATIONS 100000000
// thread start function
void * action(void *ptr);
void shared_cache(void);
unsigned long intensive_task(unsigned long value);
int main(int argc, char **argv) {
shared_cache();
printf("\nProcessors number: %d.\n", PROCESSORS_AVAILABLE);
return 0;
}
// This is a slow action, because we share the cache line among threads
// and, when a thread writes to the cache line, it invalidates the other threads' copies,
// forcing them to go to main memory to retrieve the line again
void * action(void *ptr) {
// Converts to a long pointer
unsigned long *int_ptr = (unsigned long*)ptr;
// dereferences the pointer and assigns a new value on every iteration
for (int index = 0; index < ITERATIONS; index++) {
// writes to the line that is in the cache;
// this invalidates the other threads' cached copies, since we share the cache line with them,
// and forces them to retrieve fresh data from main memory
*int_ptr = intensive_task(*int_ptr) + *int_ptr;
}
return NULL;
}
// simulates a CPU-intensive computation
unsigned long intensive_task(unsigned long value) {
return value * value / 2 * value / 3 * value / 5 * value % 3 * value * value / 2;
}
void shared_cache(void) {
// Creates one thread per processor available to this program
pthread_t threads[PROCESSORS_AVAILABLE];
// since it is an array, multiple elements will share a single cache line
unsigned long result[PROCESSORS_AVAILABLE];
// My processor has an L1 data cache with 64-byte lines.
// unsigned long -> 8 bytes
// | | | | | | | | | | | | | | | | | | | | | |
// | thread 1 | thread 2 | thread 3 |
// When reading from main memory, the processor does not read byte by byte;
// it reads a whole cache line (64 bytes here) and also prefetches,
// guessing that the next 64 bytes will be needed soon
// (this is one reason arrays are so cache friendly).
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
result[index] = index;
}
const clock_t tic = clock();
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
/**
* Will cause false sharing: the result array is shared with all threads,
* and each thread's slot sits on the same cache line as its neighbours.
* When one thread writes its calculation, the processor invalidates the other
* threads' copies of that line, and they must retrieve it again from main memory,
* which is far slower than the processor cache.
**/
pthread_create(&threads[index], NULL, action, &(result[index]));
}
// Waits for threads to finish
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
pthread_join(threads[index], NULL);
}
const clock_t toc = clock();
for (int index = 0; index < PROCESSORS_AVAILABLE; index++) {
printf("Thread: %d result: %ld.\n", index, result[index]);
}
printf("Shared Cache False Sharing Time spent: %f seconds ...\n", (double)(toc - tic) / CLOCKS_PER_SEC);
}
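The second file below avoids false sharing by accumulating into a thread-local variable. Another common way to avoid it, not part of the original gist and shown here only as a minimal sketch, is to keep the direct writes but give each per-thread slot its own cache line with C11 alignment (the 64-byte line size is an assumption, matching the comments above):

#include <stdio.h>
#include <stdalign.h>

// Assumed cache line size; matches the 64-byte L1 data line mentioned in the comments above.
#define CACHE_LINE_SIZE 64

// Each slot is aligned to (and therefore padded out to) a full cache line,
// so writes from different threads never touch the same line.
struct padded_counter {
    alignas(CACHE_LINE_SIZE) unsigned long value;
};

int main(void) {
    struct padded_counter result[4];
    // Because of the alignment, consecutive elements land on different cache lines.
    printf("sizeof(struct padded_counter) = %zu bytes\n", sizeof result[0]);
    printf("distance between slots        = %zu bytes\n",
           (size_t)((char *)&result[1] - (char *)&result[0]));
    return 0;
}

The trade-off is memory: each unsigned long now occupies a full 64 bytes, but the threads can keep writing directly to their slots without invalidating each other's cache lines.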
/**
* This video https://www.youtube.com/watch?v=WDIkqP4JbkE inspired me to write this code. Just to play with my new knowledge :)
* Special Thanks to Scott Meyers!!!
*
* Some resources too:
* https://stackoverflow.com/questions/5248915/execution-time-of-c-program
* https://stackoverflow.com/questions/11624545/how-to-make-main-thread-wait-for-all-child-threads-finish
*
* To compile, run: clang -g -Wall -pedantic -lpthread -o Shared-Cache.out Shared-Cache.c
*/
#include <time.h> // clock
#include <stdio.h> // printf
#include <pthread.h> // threads stuff
#include <sys/sysinfo.h> // get_nprocs_conf
#define PROCESSORS_AVAILABLE get_nprocs_conf()
#define ITERATIONS 100000000
// thread start function
void * action(void *ptr);
void shared_cache(void);
unsigned long intensive_task(unsigned long value);
int main(int argc, char **argv) {
shared_cache();
printf("\nProcessors number: %d.\n", PROCESSORS_AVAILABLE);
return 0;
}
// This is a fast action (compared to the slow one). Even though we share the cache line among threads,
// we write to the shared cache line only once per thread.
// So, the cache line is invalidated only once per thread.
void * action(void *ptr) {
// Converts to a long pointer
unsigned long *int_ptr = (unsigned long*)ptr;
// Reads do not invalidate other cores' caches; only writes do.
unsigned long local_sum = *int_ptr;
for (int index = 0; index < ITERATIONS; index++) {
// Updates the local variable
local_sum = intensive_task(local_sum) + local_sum;
}
// Access shared cache line only once
*int_ptr = local_sum;
return NULL;
}
// simulates a CPU-intensive computation
unsigned long intensive_task(unsigned long value) {
return value * value / 2 * value / 3 * value / 5 * value % 3 * value * value / 2;
}
void shared_cache(void) {
// Creates one thread per processor available to this program
pthread_t threads[PROCESSORS_AVAILABLE];
// multiple elements per cache line
unsigned long result[PROCESSORS_AVAILABLE];
// My processor has an L1 data cache with 64-byte lines.
// unsigned long -> 8 bytes
// | | | | | | | | | | | | | | | | | | | | | |
// | thread 1 | thread 2 | thread 3 |
// When reading from main memory, the processor does not read byte by byte;
// it reads a whole cache line (64 bytes here) and also prefetches,
// guessing that the next 64 bytes will be needed soon
// (this is one reason arrays are so cache friendly).
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
result[index] = index;
}
const clock_t tic_cache = clock();
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
/**
* In this implementation, every thread accumulates its calculation in a local variable.
* Since every thread has its own stack, these writes do not affect the other threads,
* and the shared cache line is invalidated only once, when the final result is stored.
**/
pthread_create(&threads[index], NULL, action, &(result[index]));
}
// Waits for threads to finish
for(int index = 0; index < PROCESSORS_AVAILABLE; index++) {
pthread_join(threads[index], NULL);
}
const clock_t toc_cache = clock();
for (int index = 0; index < PROCESSORS_AVAILABLE; index++) {
printf("Thread: %d result: %ld.\n", index, result[index]);
}
printf("Time spent: %f seconds ...\n", (double)(toc_cache - tic_cache) / CLOCKS_PER_SEC);
}
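One caveat when comparing the two numbers: clock() measures the processor time consumed by the whole process, which on POSIX systems includes all threads combined, so the printed value is CPU seconds rather than wall-clock seconds. A minimal sketch of wall-clock timing with clock_gettime, with the thread creation and joining elided (the structure mirrors shared_cache()):

#include <stdio.h>
#include <time.h>

int main(void) {
    struct timespec start, stop;
    clock_gettime(CLOCK_MONOTONIC, &start);
    // ... create and join the worker threads here, as shared_cache() does ...
    clock_gettime(CLOCK_MONOTONIC, &stop);
    // Elapsed wall-clock time in seconds
    double elapsed = (double)(stop.tv_sec - start.tv_sec)
                   + (double)(stop.tv_nsec - start.tv_nsec) / 1e9;
    printf("Wall-clock time: %f seconds\n", elapsed);
    return 0;
}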