EduardoRFS/no_free_lunch.md

## no_free_lunch.md

      
    Raw
  

              no_free_lunch.md
            
          
    Bend vs Reality

TLDR: I understand the proposal of Bend, but when the efficiency performance is so big that a RTX 4090 is only 7x faster on a near-optimal scenario than 2 cores of a M3 Max in a language like JavaScript, you should probably not take it.
On the otherhand, easy to use, but opt-in, parallel languages such as OCaml exists and it can compete, so you should likely take it. If you need even more performance, Rust could likely beat the RTX 4090 results on a mobile CPU.
Of course future optimizations should improve Bend results, but my goal here is to show that the current results are not as impressive as they may look, likely a JIT will make the RTX 4090 results 10x faster, but keep always in mind, a RTX 4090 still uses at least 100 times more power than a single M3 core at any instant and that in principle GPUs are better for purely parallel tasks.
Also keep in mind that this is a very friendly code to parallelism, this is both against Bend and in favour of it, most real code is not pure and not a purely binary algorithm, so making most code parallel is not as easy, but that also limits how much magic Bend can do.
The code is based on the original examples at https://gist.github.com/VictorTaelin/c30f2c37344b209be3896afcde480b4f.
I don't have a RTX 4090 in hands but the data is around.


source
hardware
time
multiplier


bitonic.bend
Victor's RTX 4090
0.58s
1.0x


bitonic.js
Victor's M3 Max
3.59s
0.16x


bitonic_ml_p.ml
Antonio's M3 Max
0.86s
0.67x


bitonic_p.ml
Antonio's M3 Max
1.18s
0.49x


bitonic.ml
Antonio's M3 Max
2.89s
0.20x


Idiomatic and parallel

This took me about an hour of playing, it is pretty readable, in my opinion better than the original code overall and the RTX 4090 is only 1.5x faster.
type tree = Leaf of int | Both of tree * tree

let rec gen d x =
  match d with
  | 0 -> Leaf (x mod 0x1000000)
  | _ -> Both (gen (d - 1) ((x * 2) + 1), gen (d - 1) (x * 2))

let rec sum t =
  match t with Leaf v -> v | Both (a, b) -> (sum a + sum b) mod 0x1000000

let swap s a b = match s with 0 -> (a, b) | _ -> (b, a)

let rec warp s a b =
  match (a, b) with
  | Leaf aa, Leaf bb -> swap (s lxor if aa > bb then 1 else 0) a b
  | Both (aa, ab), Both (ba, bb) ->
      let wAa, wAb = warp s aa ba in
      let wBa, wBb = warp s ab bb in
      (Both (wAa, wBa), Both (wAb, wBb))
  | _ -> assert false

let warp s a b =
  let a, b = warp s a b in
  Both (a, b)

let rec flow s t =
  match t with Leaf _ -> t | Both (a, b) -> down s (warp s a b)

and down s t =
  match t with Leaf _ -> t | Both (a, b) -> Both (flow s a, flow s b)

let rec sort s t =
  match t with Leaf _ -> t | Both (a, b) -> flow s (Both (sort 0 a, sort 1 b))

let rec sort_p c s t =
  match c < 2 with
  | true -> sort s t
  | false -> (
      match t with
      | Leaf _ -> t
      | Both (a, b) ->
          let left = Domain.spawn @@ fun () -> sort_p (c / 2) 0 a in
          let right = Domain.spawn @@ fun () -> sort_p (c / 2) 1 b in
          flow s (Both (Domain.join left, Domain.join right)))

let main () =
  let domains = 8 in
  sum (sort_p domains 0 (gen 20 0))

let () = Format.eprintf "%d\n%!" (main ())
Parallel one to one translation

This one is literally a translation with multicore on sort and it is and Bend on the RTX 4090 is only 2x faster.
Took me less than 20 min to add the parallel branches needed and this reflects my experience to adding parallelism to pure programs.
type tree = Leaf of int | Both of tree * tree

let rec gen d x =
  match d with
  | 0 -> Leaf (x mod 0x1000000)
  | _ -> Both (gen (d - 1) ((x * 2) + 1), gen (d - 1) (x * 2))

let unpack_leaf t = match t with Leaf t -> t | Both _ -> assert false
let unpack_both t = match t with Leaf _ -> assert false | Both (a, b) -> (a, b)

let rec sum d t =
  match d with
  | 0 -> unpack_leaf t
  | _ ->
      let a, b = unpack_both t in
      (sum (d - 1) a + sum (d - 1) b) mod 0x1000000

let swap s a b = match s with 0 -> Both (a, b) | _ -> Both (b, a)

let rec warp d s a b =
  match d with
  | 0 -> swap (s lxor if a > b then 1 else 0) a b
  | _ ->
      let aa, ab = unpack_both a in
      let ba, bb = unpack_both b in
      let wAa, wAb = unpack_both @@ warp (d - 1) s aa ba in
      let wBa, wBb = unpack_both @@ warp (d - 1) s ab bb in
      Both (Both (wAa, wBa), Both (wAb, wBb))

let rec flow d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      down d s (warp (d - 1) s a b)

and down d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      Both (flow (d - 1) s a, flow (d - 1) s b)

let rec sort d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      flow d s (Both (sort (d - 1) 0 a, sort (d - 1) 1 b))

let rec sort_p c d s t =
  match c < 2 with
  | true -> sort d s t
  | false -> (
      match d with
      | 0 -> t
      | _ ->
          let a, b = unpack_both t in
          let left = Domain.spawn @@ fun () -> sort_p (c / 2) (d - 1) 0 a in
          let right = Domain.spawn @@ fun () -> sort_p (c / 2) (d - 1) 1 b in
          flow d s (Both (Domain.join left, Domain.join right)))

let main () =
  let domains = 8 in
  sum 20 (sort_p domains 20 0 (gen 20 0))

let () = Format.eprintf "%d\n%!" (main ())
One to one translation

This one is literally a translation without any improvements and it is considerably faster than the JS version, only 5x slower than Bend on the RTX 4090.
Took me less than 20min to get to this point.
type tree = Leaf of int | Both of tree * tree

let rec gen d x =
  match d with
  | 0 -> Leaf (x mod 0x1000000)
  | _ -> Both (gen (d - 1) ((x * 2) + 1), gen (d - 1) (x * 2))

let unpack_leaf t = match t with Leaf t -> t | Both _ -> assert false
let unpack_both t = match t with Leaf _ -> assert false | Both (a, b) -> (a, b)

let rec sum d t =
  match d with
  | 0 -> unpack_leaf t
  | _ ->
      let a, b = unpack_both t in
      (sum (d - 1) a + sum (d - 1) b) mod 0x1000000

let swap s a b = match s with 0 -> Both (a, b) | _ -> Both (b, a)

let rec warp d s a b =
  match d with
  | 0 -> swap (s lxor if a > b then 1 else 0) a b
  | _ ->
      let aa, ab = unpack_both a in
      let ba, bb = unpack_both b in
      let wAa, wAb = unpack_both @@ warp (d - 1) s aa ba in
      let wBa, wBb = unpack_both @@ warp (d - 1) s ab bb in
      Both (Both (wAa, wBa), Both (wAb, wBb))

let rec flow d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      down d s (warp (d - 1) s a b)

and down d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      Both (flow (d - 1) s a, flow (d - 1) s b)

let rec sort d s t =
  match d with
  | 0 -> t
  | _ ->
      let a, b = unpack_both t in
      flow d s (Both (sort (d - 1) 0 a, sort (d - 1) 1 b))

let main () = sum 20 (sort 20 0 (gen 20 0))
let () = Format.eprintf "%d\n%!" (main ())
source	hardware	time	multiplier
bitonic.bend	Victor's RTX 4090	0.58s	1.0x
bitonic.js	Victor's M3 Max	3.59s	0.16x
bitonic_ml_p.ml	Antonio's M3 Max	0.86s	0.67x
bitonic_p.ml	Antonio's M3 Max	1.18s	0.49x
bitonic.ml	Antonio's M3 Max	2.89s	0.20x