tdryer/00-README.md

## 00-README.md

      
    Raw
  

              00-README.md
            
          
    How Linux approximates memory metrics

Example

The program threads-memory.c (included below) starts 100 threads, allocates 1
MB of memory in each, and then pauses. How much memory is it using?
Let's find out by running it:
$ gcc -pthread threads-memory.c -o threads-memory
$ ./threads-memory
starting threads... done
press enter to exit

While it's still running, check the RSS (resident set size) using ps. On my
Linux system, the result is:
$ ps -o rss -C threads-memory
  RSS
81084

It's only using 81 MB! How could it possibly be using less than 100 MB?
Explanation

Starting in Linux 2.6.34, the value reported by ps is an approximation:

For making accounting scalable, RSS related information are handled in
asynchronous manner and the vaule [sic] may not be very precise. To see a
precise snapshot of a moment, you can see /proc/<pid>/smaps file and scan
page table. It's slow but very precise.

Let's try to understand this change.
In Linux, threads are just processes that happen to share the same address
space (memory). The struct task_struct represents a process, and the struct
mm_struct represents an address space. mm_struct contains a counter
tracking the RSS. This is the value used by ps.
Having every thread access the same mm_struct every time memory is allocated
would be inefficient. The optimization adds a per-thread cache for the counter
in task_struct. Each cache is flushed to the associated mm_struct once
every 64 page faults in a thread. Assuming a 4 KB page size, this means that up
to 256 KB (64 * 4 KB) per thread may be unaccounted for. Probably not a big
deal, unless you're running a lot of threads!
To get a precise RSS value, you can use the pmap command, which scans the page
table rather than relying on the RSS counter:
$ pmap -x $(pidof threads-memory) | grep -E "Address|total"
Address           Kbytes     RSS   Dirty Mode  Mapping
total kB         2960712  105424  103888


## threads-memory.c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Number of threads the demo starts.
#define NUM_THREADS 100
// Bytes each thread allocates and touches (1 MB). The expansion is
// parenthesized so it behaves as a single operand inside larger expressions
// such as `total / BYTES_PER_THREAD`.
#define BYTES_PER_THREAD (1024 * 1024)

// Thread entry point: allocates BYTES_PER_THREAD bytes, writes to every byte,
// increments the shared ready counter, and then blocks in pause().
//
// num_threads_ready: pointer to the atomic_uint counter that main() polls.
// Returns NULL, and only if pause() itself returns. Exits the whole process
// with status 1 if the allocation fails.
//
// The buffer is intentionally never freed: it has to stay allocated while the
// process is being inspected from outside.
void *child_main(void *num_threads_ready) {
    const size_t size = BYTES_PER_THREAD;
    // volatile: the buffer is never read back, so without it an optimizing
    // compiler is free to drop the stores (or the allocation itself), and the
    // memory would never actually be touched.
    volatile char *data = malloc(size);
    if (data == NULL) {
        fputs("failed to allocate memory\n", stderr);
        exit(1);
    }
    for (size_t i = 0; i < size; i++) {
        data[i] = 0;
    }
    atomic_fetch_add((atomic_uint *)num_threads_ready, 1);
    pause();
    return NULL;
}

// Program entry point. Starts NUM_THREADS threads running child_main, polls
// once per second until every thread has incremented the shared counter, then
// waits for a character on stdin so the process stays alive for inspection.
// Exits with status 1 if a thread cannot be created; otherwise returns 0.
int main() {
    pthread_t workers[NUM_THREADS];
    atomic_uint ready_count;

    atomic_init(&ready_count, 0);
    fputs("starting threads... ", stderr);

    int spawned = 0;
    while (spawned < NUM_THREADS) {
        int rc = pthread_create(&workers[spawned], NULL, child_main, &ready_count);
        if (rc != 0) {
            fputs("failed to create thread\n", stderr);
            exit(1);
        }
        spawned++;
    }

    // The threads are never joined; returning from main ends the process.
    for (;;) {
        if (atomic_load(&ready_count) >= NUM_THREADS) {
            break;
        }
        sleep(1);
    }

    fputs("done\npress enter to exit", stderr);
    getchar();
    return 0;
}
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	// Number of threads the demo starts.
	#define NUM_THREADS 100
	// Bytes each thread allocates and touches (1 MB). The expansion is
	// parenthesized so it behaves as a single operand inside larger expressions
	// such as `total / BYTES_PER_THREAD`.
	#define BYTES_PER_THREAD (1024 * 1024)

	// Thread entry point: allocates BYTES_PER_THREAD bytes, writes to every byte,
	// increments the shared ready counter, and then blocks in pause().
	//
	// num_threads_ready: pointer to the atomic_uint counter that main() polls.
	// Returns NULL, and only if pause() itself returns. Exits the whole process
	// with status 1 if the allocation fails.
	//
	// The signature is the void *(*)(void *) form that pthread_create expects.
	// The buffer is intentionally never freed: it has to stay allocated while
	// the process is being inspected from outside.
	void *child_main(void *num_threads_ready) {
	    const size_t size = BYTES_PER_THREAD;
	    // volatile: the buffer is never read back, so without it an optimizing
	    // compiler is free to drop the stores (or the allocation itself), and
	    // the memory would never actually be touched.
	    volatile char *data = malloc(size);
	    if (data == NULL) {
	        fputs("failed to allocate memory\n", stderr);
	        exit(1);
	    }
	    for (size_t i = 0; i < size; i++) {
	        data[i] = 0;
	    }
	    atomic_fetch_add((atomic_uint *)num_threads_ready, 1);
	    pause();
	    return NULL;
	}

	// Program entry point. Starts NUM_THREADS threads running child_main, polls
	// once per second until every thread has incremented the shared counter,
	// then waits for a character on stdin so the process stays alive for
	// inspection. Exits with status 1 if a thread cannot be created; otherwise
	// returns 0.
	int main() {
	    pthread_t workers[NUM_THREADS];
	    atomic_uint ready_count;

	    atomic_init(&ready_count, 0);
	    fputs("starting threads... ", stderr);

	    int spawned = 0;
	    while (spawned < NUM_THREADS) {
	        int rc = pthread_create(&workers[spawned], NULL, child_main, &ready_count);
	        if (rc != 0) {
	            fputs("failed to create thread\n", stderr);
	            exit(1);
	        }
	        spawned++;
	    }

	    // The threads are never joined; returning from main ends the process.
	    for (;;) {
	        if (atomic_load(&ready_count) >= NUM_THREADS) {
	            break;
	        }
	        sleep(1);
	    }

	    fputs("done\npress enter to exit", stderr);
	    getchar();
	    return 0;
	}