Simplified MSI model of shared memory

How can one understand and predict the performance of multicore algorithms?

As a first step, I'd recommend acquiring a basic understanding of the MSI protocol. Modern systems use somewhat more complex protocols, but a basic understanding of MSI should be enough to predict the most important phenomena.

Below is an even further simplified model of shared memory based on the MSI protocol. While the model is imprecise and makes several simplifications not based on actual hardware, it should be accurate enough to predict many phenomena, including, for example, the false-sharing performance pitfall.

Simplified MSI model of shared memory

The memory is divided into locations called cache lines. Each core has its own copy of the whole memory (all cache lines) and additionally considers each of its own cache lines to be in one of the following three states:

  • Modified
  • Shared
  • Invalid

State changes do not happen spontaneously; they are triggered by some core performing a read from or a write to a cache line.

When a cache line of a core is in the Modified state, both reads and writes are cheap. The cache line must then be in the Invalid state in all the other cores, and neither a read nor a write requires changing the state of the cache line in those other cores.

Reading a cache line in the Shared state is cheap. Writing to a cache line in the Shared state is more expensive: the state of the cache line in the core that performs the write changes to Modified and, additionally, the state of the corresponding cache line in all the other cores changes to Invalid.

When a core wants to read a cache line in the Invalid state, it needs another core that has the cache line in either the Modified or the Shared state to send a copy. If the sender was in the Modified state, it must change its state to Shared; the reader also sets its state to Shared. Writing to an Invalid cache line likewise requires another core that has the cache line in the Modified or Shared state to send a copy, after which all the cores that have the cache line in the Modified or Shared state change their state to Invalid and the writer sets its state to Modified.
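
The rules above are compact enough to write down directly. Here is a small illustrative OCaml sketch (not part of the gist; the type and function names are made up for illustration) that tracks one cache line's state in the accessing core and in all the other cores:

(* One cache line's state in a single core. *)
type state = Modified | Shared | Invalid

(* A read leaves a Modified line alone; otherwise the reader ends up in
   Shared, and any other core holding the line in Modified downgrades to
   Shared after sending a copy. *)
let read self others =
  match self with
  | Modified -> (Modified, others)
  | Shared | Invalid ->
    (Shared, List.map (function Modified -> Shared | s -> s) others)

(* A write leaves the writer in Modified and invalidates the line in
   every other core. *)
let write _self others = (Modified, List.map (fun _ -> Invalid) others)

For example, write Invalid [Modified; Invalid] evaluates to (Modified, [Invalid; Invalid]), matching the rule that a write invalidates the line everywhere else.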

The various state changes and transmissions of cache line contents have potentially different costs. Transitions from the Invalid state to other states tend to be expensive.

While drastically simplified, this model should be accurate enough to help predict the performance of many multicore algorithms. To predict the performance of one algorithm relative to another, determine and count the state changes and cache line transfers each algorithm causes.
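
As an example of such counting, consider the false-sharing pitfall mentioned earlier. The rough sketch below is not part of the gist (increment_loop and run_pair are made-up names), but it uses the same Atomic, Domain, and Multicore_magic.copy_as_padded primitives as the benchmark further down. Two domains each increment their own counter. When the two atomics share a cache line, every increment writes to a line the other core has just written, so each write pays for an Invalid-to-Modified transition plus an invalidation of the other core's copy; when each counter is padded onto its own cache line, both cores keep their lines in the Modified state and each write is cheap.

let increment_loop counter n =
  for _ = 1 to n do
    Atomic.incr counter
  done

let run_pair a b n =
  let d1 = Domain.spawn (fun () -> increment_loop a n)
  and d2 = Domain.spawn (fun () -> increment_loop b n) in
  Domain.join d1;
  Domain.join d2

let () =
  let n = 10_000_000 in
  (* Allocated back to back: likely to end up on the same cache line. *)
  let a = Atomic.make 0 and b = Atomic.make 0 in
  run_pair a b n;
  (* Each padded copy gets its own cache line: no false sharing. *)
  let a = Multicore_magic.copy_as_padded (Atomic.make 0)
  and b = Multicore_magic.copy_as_padded (Atomic.make 0) in
  run_pair a b n

Timing the two run_pair calls (e.g. with Unix.gettimeofday, as the benchmark below does) would typically show the shared-line version running several times slower.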

Concrete timings

This gist also includes a program, written for Apple M1, that tries to quantify the relative costs of the various state changes. On my Apple M1 laptop the program prints roughly the following:

Relative to fastest:

  Read  M:     1.00 *
  Write M:     2.58 ***

  Read  S:     1.04 *
  Write S:    85.40 *************************************************************************************

  Read  Is:    2.36 **
  Write Is:   31.34 *******************************

  Read  Im:    5.02 *****
  Write Im:   32.61 *********************************

In the output, M, S, Is, and Im describe the state of the timed cache lines in the measuring core before the operation: Modified, Shared, Invalid while another core holds the line in the Shared state, and Invalid while another core holds it in the Modified state, respectively. Note that the program uses constants specific to Apple M1: the 128-byte L1 cache line size (line = 128 / 8 eight-byte words) and a working set of roughly 100 KiB chosen relative to the L1 data cache size.

; dune
(tests
 (names Quantify_msi)
 (libraries unix multicore-magic))

; dune-project
(lang dune 3.3)
[@@@alert "-unstable"]

(* Messages sent from the main domain to a worker domain. *)
type message =
  | Initial
  | Ready
  | Overhead of int array
  | Read of int array
  | Write of int array
  | Stop

(* Words per cache line: 128-byte lines (Apple M1) of 8-byte words. *)
let line = 128 / 8

let micros time = Float.to_int (time *. 1000000.0)

(* Walk the same indices as [read] and [write] below without touching
   the array, to measure the cost of the loop itself. *)
let rec overhead sum i n =
  if i < n then overhead (sum lor i) (i + (line * 8)) n else sum

(* Read one word from each of the next 8 cache lines. *)
let rec read cache sum i n =
  if i < n then
    let sum = sum lor Array.unsafe_get cache (i + (0 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (1 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (2 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (3 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (4 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (5 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (6 * line)) in
    let sum = sum lor Array.unsafe_get cache (i + (7 * line)) in
    read cache sum (i + (line * 8)) n
  else sum

(* Write one word to each of the next 8 cache lines. *)
let rec write cache i n =
  if i < n then (
    Array.unsafe_set cache (i + (0 * line)) 0;
    Array.unsafe_set cache (i + (1 * line)) 0;
    Array.unsafe_set cache (i + (2 * line)) 0;
    Array.unsafe_set cache (i + (3 * line)) 0;
    Array.unsafe_set cache (i + (4 * line)) 0;
    Array.unsafe_set cache (i + (5 * line)) 0;
    Array.unsafe_set cache (i + (6 * line)) 0;
    Array.unsafe_set cache (i + (7 * line)) 0;
    write cache (i + (line * 8)) n)
(* Spawn a worker domain that waits for messages on [ch], runs the
   requested operation over the given array, and publishes the elapsed
   time in microseconds through [timing]. *)
let worker () =
  let timing = Multicore_magic.copy_as_padded @@ Atomic.make 0 in
  let ch = Multicore_magic.copy_as_padded @@ Atomic.make @@ Initial in
  let atomic = Multicore_magic.copy_as_padded @@ Atomic.make 0 in
  let domain =
    Domain.spawn @@ fun () ->
    while
      match Atomic.get ch with
      | Initial ->
          Atomic.set ch Ready;
          true
      | Ready -> true
      | Overhead cache ->
          let n = Multicore_magic.length_of_padded_array cache in
          Multicore_magic.fence atomic;
          let start = micros @@ Unix.gettimeofday () in
          let sum = overhead 0 0 n in
          Multicore_magic.fence atomic;
          let stop = micros @@ Unix.gettimeofday () in
          (* Consume [sum] so the loop cannot be discarded. *)
          if sum <> 0 then (
            Atomic.set timing (stop - start);
            Atomic.set ch Ready);
          true
      | Read cache ->
          let n = Multicore_magic.length_of_padded_array cache in
          Multicore_magic.fence atomic;
          let start = micros @@ Unix.gettimeofday () in
          let sum = read cache 0 0 n in
          Multicore_magic.fence atomic;
          let stop = micros @@ Unix.gettimeofday () in
          if sum = 0 then (
            Atomic.set timing (stop - start);
            Atomic.set ch Ready);
          true
      | Write cache ->
          let n = Multicore_magic.length_of_padded_array cache in
          Multicore_magic.fence atomic;
          let start = micros @@ Unix.gettimeofday () in
          write cache 0 n;
          Multicore_magic.fence atomic;
          let stop = micros @@ Unix.gettimeofday () in
          Atomic.set timing (stop - start);
          Atomic.set ch Ready;
          true
      | Stop -> false
    do
      ()
    done
  in
  Multicore_magic.copy_as_padded (ch, timing, domain)
(* Busy-wait until the worker has acknowledged the previous message. *)
let rec ready ((ch, _, _) as wr) = if Atomic.get ch != Ready then ready wr

let stop (ch, _, domain) =
  Atomic.set ch Stop;
  Domain.join domain

let atomic = Multicore_magic.copy_as_padded @@ Atomic.make 0

(* Send [message] to a worker and wait for the resulting timing. *)
let timing (ch, timing, _) message =
  Multicore_magic.fence atomic;
  Atomic.set ch message;
  while Atomic.get ch != Ready do
    ()
  done;
  Atomic.get timing

let run wr message = timing wr message |> ignore

let rec accumulate sum count timing =
  if 0 < count then accumulate (sum + timing ()) (count - 1) timing else sum

(* Repeat a scenario 100000 times and subtract the measured loop overhead. *)
let measure overhead timing = accumulate 0 100000 timing - overhead

let stars x = String.make (Int.max 0 (Float.to_int (Float.round x))) '*'

let print_measurement name x min =
  let x = Float.of_int x /. Float.of_int min in
  Printf.printf "  %s %7.2f %s\n" name x (stars x)
(* Per-run results are collected into preallocated mutable records so
   that no allocation happens during the measurement runs. *)
type results = {
  mutable read_m : int;
  mutable write_m : int;
  mutable read_s : int;
  mutable write_s : int;
  mutable read_i_s : int;
  mutable write_i_s : int;
  mutable read_i_m : int;
  mutable write_i_m : int;
}

let results () =
  {
    read_m = 0;
    write_m = 0;
    read_s = 0;
    write_s = 0;
    read_i_s = 0;
    write_i_s = 0;
    read_i_m = 0;
    write_i_m = 0;
  }

(* Spawn three workers and drive them through the different scenarios. *)
let () =
  let worker_1 = worker () and worker_2 = worker () and worker_3 = worker () in
  ready worker_1;
  ready worker_2;
  ready worker_3;
  (* Roughly 100 KiB working set; a constant related to the Apple M1 cache sizes. *)
  let cache_in_words = 100 * 1024 / 8 in
  let cache = Multicore_magic.make_padded_array cache_in_words 0 in
  let overhead = Multicore_magic.copy_as_padded @@ Overhead cache
  and read = Multicore_magic.copy_as_padded @@ Read cache
  and write = Multicore_magic.copy_as_padded @@ Write cache in
  let results = [| results (); results (); results () |] in
  (* Each scenario first drives the workers so that worker_1's cache
     lines end up in a known state, then times the interesting
     operation on worker_1. *)
  let read_m () =
    run worker_2 read;
    run worker_3 read;
    run worker_1 write;
    timing worker_1 read
  and write_m () =
    run worker_2 read;
    run worker_3 read;
    run worker_1 write;
    timing worker_1 write
  and read_s () =
    run worker_1 read;
    run worker_2 read;
    run worker_3 read;
    timing worker_1 read
  and write_s () =
    run worker_1 read;
    run worker_2 read;
    run worker_3 read;
    timing worker_1 write
  and read_i_s () =
    run worker_1 read;
    run worker_2 write;
    run worker_3 read;
    timing worker_1 read
  and write_i_s () =
    run worker_1 read;
    run worker_2 write;
    run worker_3 read;
    timing worker_1 write
  and read_i_m () =
    run worker_1 read;
    run worker_2 read;
    run worker_3 write;
    timing worker_1 read
  and write_i_m () =
    run worker_1 read;
    run worker_2 read;
    run worker_3 write;
    timing worker_1 write
  in
  let overhead () = timing worker_1 overhead in
  Gc.full_major ();
  let _warmup = measure 0 overhead in
  let overhead = measure 0 overhead in
  for i = 0 to Array.length results - 1 do
    let results = results.(i) in
    results.read_m <- measure overhead read_m;
    results.write_m <- measure overhead write_m;
    results.read_s <- measure overhead read_s;
    results.write_s <- measure overhead write_s;
    results.read_i_s <- measure overhead read_i_s;
    results.write_i_s <- measure overhead write_i_s;
    results.read_i_m <- measure overhead read_i_m;
    results.write_i_m <- measure overhead write_i_m
  done;
  Gc.print_stat stdout;
  Printf.printf "\n";
  stop worker_1;
  stop worker_2;
  stop worker_3;
  results
  |> Array.iter
     @@ fun {
              read_m;
              write_m;
              read_s;
              write_s;
              read_i_s;
              write_i_s;
              read_i_m;
              write_i_m;
            } ->
     let min =
       List.fold_left Int.min read_m
         [ write_m; read_s; write_s; read_i_m; write_i_m; read_i_s; write_i_s ]
     in
     Printf.printf "Relative to fastest:\n\n";
     print_measurement "Read  M: " read_m min;
     print_measurement "Write M: " write_m min;
     Printf.printf "\n";
     print_measurement "Read  S: " read_s min;
     print_measurement "Write S: " write_s min;
     Printf.printf "\n";
     print_measurement "Read  Is:" read_i_s min;
     print_measurement "Write Is:" write_i_s min;
     Printf.printf "\n";
     print_measurement "Read  Im:" read_i_m min;
     print_measurement "Write Im:" write_i_m min;
     Printf.printf "\n";
     flush stdout
gasche commented Jan 19, 2023

List.fold_left Float.min read_m [write_m; write_s; read_i_m; write_i_m; read_i_s; write_i_s]

polytypic commented Jan 19, 2023

@gasche Thanks! When I chose not to use a list to compute the minimum, I had in mind to avoid allocations as much as possible during and between the measurement runs, but then I guess I got lazy at the point of doing the printing step. One could e.g. collect the results to preallocated records and then print after all the runs. (The goal is to get accurate measurements.)

polytypic commented Jan 19, 2023

@gasche Thanks again!

After your comment, I got curious about whether the code was allocating as little as I expected. So, I added calls to Gc.print_stat to verify that the code isn't allocating too much. It turned out that the code was actually allocating like crazy and performing major collections during the measurement runs! So, I took a bigger hammer and rewrote the allocating bits. I believe the code no longer allocates during measurements.

The problem isn't that allocation might be slow, for example. The problem is that any additional memory accesses (that go through the data cache) may invalidate the measurements by disturbing the state of the cache. It is the first rule of benchmarking: understand what you are measuring.
