katox/smt.clj

## smt.clj
;;
;; http://johnj.com/from-elegance-to-speed.html
;;
;; The Clojure version
;; -------------------

(ns smt
  (:require [net.cgrand.xforms :as x]
            [criterium.core :as c :refer [quick-bench]]))

(set! *unchecked-math* :warn-on-boxed)

(def ^:const TIMES-LENGTH (long 1e6))

(defmacro benchmark* [expr-name expr len]
  `(let [result# (c/quick-benchmark ~expr {})
         mean# (double (first (:mean result#)))]
     (println "Benchmarking:" ~expr-name)
     (c/report-result result#)
     (println (format "Processing rate: %.03f MHz" (/ ~len mean# 1e6)))))

(defmacro benchmark [expr-name expr]
  `(benchmark* ~expr-name ~expr ~TIMES-LENGTH))


; The original blog post Clojure code
(defn smt-8 [times]
  (->> times
       (partition 8 1)
       (map (juxt identity
                  (comp (partial apply -)
                        (juxt last first))))
       (filter (comp (partial > 1000) second))))

(benchmark
  "Original smt-8 code"
  (count
    (let [times (iterate #(+ % (rand-int 1000)) 0)]
      (smt-8 (take TIMES-LENGTH times)))))
; Benchmarking: Original smt-8 code
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 2.273346 sec
; Execution time std-deviation : 15.132003 ms
; Execution time lower quantile : 2.257620 sec ( 2.5%)
; Execution time upper quantile : 2.293020 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.440 MHz


; First steps
; ------------
; "The first step is to translate our code as is to Common Lisp. We’re going
; to punt entirely on the question of laziness for this post"

; So let's look at the lazy randomly increasing sequence
(benchmark
  "Time series lazy seq created by `iterate`"
  (doall (take TIMES-LENGTH (iterate #(+ % (rand-int 1000)) 0))))
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 126.536857 ms
; Execution time std-deviation : 1.939898 ms
; Execution time lower quantile : 124.567926 ms ( 2.5%)
; Execution time upper quantile : 128.832302 ms (97.5%)
; Overhead used : 7.977976 ns

; We can get rid of lazyness and put it all into a vector
(defn compute-times [n]
  (let [n (int n)
        v (transient [])]
    (loop [i (int 0)
           y 0
           res v]
      (if (< i n)
        (recur (unchecked-inc-int i)
               (unchecked-add y (long (rand 1000)))
               (conj! res y))
        (persistent! res)))))

(benchmark
  "Time series vector created using `transient`"
  (doall (compute-times TIMES-LENGTH)))
; Evaluation count : 18 in 6 samples of 3 calls.
; Execution time mean : 48.965833 ms
; Execution time std-deviation : 1.640308 ms
; Execution time lower quantile : 46.496836 ms ( 2.5%)
; Execution time upper quantile : 50.608152 ms (97.5%)
; Overhead used : 8.141903 ns

; Or even a simple java array of primitive longs could do
(defn acompute-times ^longs [^long n]
  (let [array (long-array n)]
    (loop [i (int 0)
           y 0]
      (if (< i n)
        (let [y' (unchecked-add y (long (rand 1000)))]
          (aset array i y)
          (recur (unchecked-inc-int i) y'))
        array))))

(benchmark
  "Time series java array created using primitives"
  (doall (acompute-times TIMES-LENGTH)))
; Evaluation count : 24 in 6 samples of 4 calls.
; Execution time mean : 34.014258 ms
; Execution time std-deviation : 461.698023 µs
; Execution time lower quantile : 33.445153 ms ( 2.5%)
; Execution time upper quantile : 34.505905 ms (97.5%)
; Overhead used : 8.140303 ns


; Let the original smt-8 iterate over the precomputed array
(benchmark
  "smt-8 iterating over the array"
  (count
    (smt-8 (acompute-times TIMES-LENGTH))))
; Benchmarking: smt-8 iterating over the array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 2.220764 sec
; Execution time std-deviation : 22.597689 ms
; Execution time lower quantile : 2.190454 sec ( 2.5%)
; Execution time upper quantile : 2.246106 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.450 MHz

; With cosmetic changes to avoid the overuse of comp and juxt
(defn v2-smt-8 [^longs times]
  (->> times
       (partition 8 1)
       (map (fn [x] (let [f (first x)
                          l (last x)]
                      [x (- l f)])))
       (filter (fn [[x diff]] (< diff 1000)))))

(benchmark
  "v2-smt-8 without juxt iterating over the array"
  (count
    (v2-smt-8 (acompute-times TIMES-LENGTH))))
; Benchmarking: v2-smt-8 without juxt iterating over the array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 1.924487 sec
; Execution time std-deviation : 13.311011 ms
; Execution time lower quantile : 1.910717 sec ( 2.5%)
; Execution time upper quantile : 1.939049 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.520 MHz


; But so far the code is still lazy
; we can switch to eager by using transducers
(defn v3-smt-8 [^longs times]
  (sequence
    (comp
      (x/partition 8 1)
      (map (fn [x] (let [f (first x)
                         l (last x)]
                     [x (- l f)])))
      (filter (fn [[x diff]] (< diff 1000))))
    times))

(benchmark
  "v3-smt-8 simplified iterating over the array"
  (count
    (v3-smt-8 (acompute-times TIMES-LENGTH))))
; Benchmarking: v3-smt-8 simplified iterating over the array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 1.056290 sec
; Execution time std-deviation : 11.500128 ms
; Execution time lower quantile : 1.040926 sec ( 2.5%)
; Execution time upper quantile : 1.067729 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.947 MHz


; It looks that more records are discarded than passed
; Let's try to swap `map` and `filter`
(defn v4-smt-8 [^longs times]
  (sequence
    (comp
      (x/partition 8 1)
      (filter (fn [x] (< (- (last x) (first x)) 1000)))
      (map (fn [x] [x (- (last x) (first x))])))
    times))

(benchmark
  "v4-smt-8 reordered `filter` and `map` iterating over the array"
  (count
    (v4-smt-8 (acompute-times TIMES-LENGTH))))
; Benchmarking: v4-smt-8 reordered `filter` and `map` iterating over the array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 1.039309 sec
; Execution time std-deviation : 7.558152 ms
; Execution time lower quantile : 1.030284 sec ( 2.5%)
; Execution time upper quantile : 1.047647 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.962 MHz

; Let's get rid of boxed int math first
(defmacro diff [x]
  `(unchecked-subtract (long (last ~x)) (long (first ~x))))

(defn v5-smt-8 [^longs times]
  (sequence
    (comp
      (x/partition 8 1)
      (filter (fn [x] (< (diff x) 1000)))
      (map (fn [x] [x (diff x)])))
    times))

(benchmark
  "v5-smt-8 using primitive math iterating over the array"
  (count
    (v5-smt-8 (acompute-times TIMES-LENGTH))))
; Benchmarking: v5-smt-8 using primitive math iterating over the array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 1.025017 sec
; Execution time std-deviation : 14.802192 ms
; Execution time lower quantile : 1.006743 sec ( 2.5%)
; Execution time upper quantile : 1.041188 sec (97.5%)
; Overhead used : 8.159543 ns
; Processing rate: 0.976 MHz


; "To improve the algorithm proper, I perform the entire computation in a single
; loop statement. I also change the output slightly to show the index for matching
; 8-fold events, start and end time, and event duration"

; OK, let's do the same low level optimization in clojure
(defn second-try [^longs a]
  (let [n (count a)
        max (int (- n 7))]
    (loop [i 0
           res (transient [])]
      (if (< i max)
        (let [f (aget a i)
              l (aget a (+ i 7))
              diff (unchecked-subtract l f)]
          (if (< diff 1000)
            (recur (unchecked-inc-int i) (conj! res {:index i :startval f :endval l :diff diff}))
            (recur (unchecked-inc-int i) res)))
        (persistent! res)))))

(benchmark
  "second-try single pass primitive loop function"
  (count
    (second-try (acompute-times TIMES-LENGTH))))
; Benchmarking: second-try single pass primitive loop function
; Evaluation count : 30 in 6 samples of 5 calls.
; Execution time mean : 23.821902 ms
; Execution time std-deviation : 331.676247 µs
; Execution time lower quantile : 23.518765 ms ( 2.5%)
; Execution time upper quantile : 24.198100 ms (97.5%)
; Overhead used : 8.013372 ns
; Processing rate: 41.978 MHz


; let's verify we get the same result
(let [times (acompute-times TIMES-LENGTH)
      original (smt-8 times)
      functional (v5-smt-8 times)
      optimized (second-try times)
      describe-subseq (fn [[v diff]] {:startval (first v) :endval (last v) :diff diff})
      results-equal (= (map describe-subseq original)
                       (map describe-subseq functional)
                       (map #(dissoc % :index) optimized))]
  (println "Computation results equal:" results-equal)
  results-equal)
; => true

; In the blog post section "Improvements" measurements are done on the function itself
; on a fixed 100x longer precomputed sequence
(let [BIG-LENGTH ^:const (* 100 TIMES-LENGTH)
      times (acompute-times BIG-LENGTH)]
  (benchmark*
    "second-try on a huge precomputed array"
    (count
      (second-try times))
    BIG-LENGTH))
; Benchmarking: second-try on a huge precomputed array
; Evaluation count : 6 in 6 samples of 1 calls.
; Execution time mean : 167.110500 ms
; Execution time std-deviation : 3.097737 ms
; Execution time lower quantile : 163.810594 ms ( 2.5%)
; Execution time upper quantile : 171.506338 ms (97.5%)
; Overhead used : 8.140743 ns
; Processing rate: 598.406 MHz
	;;
	;; http://johnj.com/from-elegance-to-speed.html
	;;
	;; The Clojure version
	;; -------------------

	(ns smt
	(:require [net.cgrand.xforms :as x]
	[criterium.core :as c :refer [quick-bench]]))

	(set! unchecked-math :warn-on-boxed)

	(def ^:const TIMES-LENGTH (long 1e6))

	(defmacro benchmark* [expr-name expr len]
	`(let [result# (c/quick-benchmark ~expr {})
	mean# (double (first (:mean result#)))]
	(println "Benchmarking:" ~expr-name)
	(c/report-result result#)
	(println (format "Processing rate: %.03f MHz" (/ ~len mean# 1e6)))))

	(defmacro benchmark [expr-name expr]
	`(benchmark* ~expr-name ~expr ~TIMES-LENGTH))


	; The original blog post Clojure code
	(defn smt-8 [times]
	(->> times
	(partition 8 1)
	(map (juxt identity
	(comp (partial apply -)
	(juxt last first))))
	(filter (comp (partial > 1000) second))))

	(benchmark
	"Original smt-8 code"
	(count
	(let [times (iterate #(+ % (rand-int 1000)) 0)]
	(smt-8 (take TIMES-LENGTH times)))))
	; Benchmarking: Original smt-8 code
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 2.273346 sec
	; Execution time std-deviation : 15.132003 ms
	; Execution time lower quantile : 2.257620 sec ( 2.5%)
	; Execution time upper quantile : 2.293020 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.440 MHz


	; First steps
	; ------------
	; "The first step is to translate our code as is to Common Lisp. We’re going
	; to punt entirely on the question of laziness for this post"

	; So let's look at the lazy randomly increasing sequence
	(benchmark
	"Time series lazy seq created by `iterate`"
	(doall (take TIMES-LENGTH (iterate #(+ % (rand-int 1000)) 0))))
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 126.536857 ms
	; Execution time std-deviation : 1.939898 ms
	; Execution time lower quantile : 124.567926 ms ( 2.5%)
	; Execution time upper quantile : 128.832302 ms (97.5%)
	; Overhead used : 7.977976 ns

	; We can get rid of lazyness and put it all into a vector
	(defn compute-times [n]
	(let [n (int n)
	v (transient [])]
	(loop [i (int 0)
	y 0
	res v]
	(if (< i n)
	(recur (unchecked-inc-int i)
	(unchecked-add y (long (rand 1000)))
	(conj! res y))
	(persistent! res)))))

	(benchmark
	"Time series vector created using `transient`"
	(doall (compute-times TIMES-LENGTH)))
	; Evaluation count : 18 in 6 samples of 3 calls.
	; Execution time mean : 48.965833 ms
	; Execution time std-deviation : 1.640308 ms
	; Execution time lower quantile : 46.496836 ms ( 2.5%)
	; Execution time upper quantile : 50.608152 ms (97.5%)
	; Overhead used : 8.141903 ns

	; Or even a simple java array of primitive longs could do
	(defn acompute-times ^longs [^long n]
	(let [array (long-array n)]
	(loop [i (int 0)
	y 0]
	(if (< i n)
	(let [y' (unchecked-add y (long (rand 1000)))]
	(aset array i y)
	(recur (unchecked-inc-int i) y'))
	array))))

	(benchmark
	"Time series java array created using primitives"
	(doall (acompute-times TIMES-LENGTH)))
	; Evaluation count : 24 in 6 samples of 4 calls.
	; Execution time mean : 34.014258 ms
	; Execution time std-deviation : 461.698023 µs
	; Execution time lower quantile : 33.445153 ms ( 2.5%)
	; Execution time upper quantile : 34.505905 ms (97.5%)
	; Overhead used : 8.140303 ns


	; Let the original smt-8 iterate over the precomputed array
	(benchmark
	"smt-8 iterating over the array"
	(count
	(smt-8 (acompute-times TIMES-LENGTH))))
	; Benchmarking: smt-8 iterating over the array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 2.220764 sec
	; Execution time std-deviation : 22.597689 ms
	; Execution time lower quantile : 2.190454 sec ( 2.5%)
	; Execution time upper quantile : 2.246106 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.450 MHz

	; With cosmetic changes to avoid the overuse of comp and juxt
	(defn v2-smt-8 [^longs times]
	(->> times
	(partition 8 1)
	(map (fn [x] (let [f (first x)
	l (last x)]
	[x (- l f)])))
	(filter (fn [[x diff]] (< diff 1000)))))

	(benchmark
	"v2-smt-8 without juxt iterating over the array"
	(count
	(v2-smt-8 (acompute-times TIMES-LENGTH))))
	; Benchmarking: v2-smt-8 without juxt iterating over the array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 1.924487 sec
	; Execution time std-deviation : 13.311011 ms
	; Execution time lower quantile : 1.910717 sec ( 2.5%)
	; Execution time upper quantile : 1.939049 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.520 MHz


	; But so far the code is still lazy
	; we can switch to eager by using transducers
	(defn v3-smt-8 [^longs times]
	(sequence
	(comp
	(x/partition 8 1)
	(map (fn [x] (let [f (first x)
	l (last x)]
	[x (- l f)])))
	(filter (fn [[x diff]] (< diff 1000))))
	times))

	(benchmark
	"v3-smt-8 simplified iterating over the array"
	(count
	(v3-smt-8 (acompute-times TIMES-LENGTH))))
	; Benchmarking: v3-smt-8 simplified iterating over the array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 1.056290 sec
	; Execution time std-deviation : 11.500128 ms
	; Execution time lower quantile : 1.040926 sec ( 2.5%)
	; Execution time upper quantile : 1.067729 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.947 MHz


	; It looks that more records are discarded than passed
	; Let's try to swap `map` and `filter`
	(defn v4-smt-8 [^longs times]
	(sequence
	(comp
	(x/partition 8 1)
	(filter (fn [x] (< (- (last x) (first x)) 1000)))
	(map (fn [x] [x (- (last x) (first x))])))
	times))

	(benchmark
	"v4-smt-8 reordered `filter` and `map` iterating over the array"
	(count
	(v4-smt-8 (acompute-times TIMES-LENGTH))))
	; Benchmarking: v4-smt-8 reordered `filter` and `map` iterating over the array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 1.039309 sec
	; Execution time std-deviation : 7.558152 ms
	; Execution time lower quantile : 1.030284 sec ( 2.5%)
	; Execution time upper quantile : 1.047647 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.962 MHz

	; Let's get rid of boxed int math first
	(defmacro diff [x]
	`(unchecked-subtract (long (last ~x)) (long (first ~x))))

	(defn v5-smt-8 [^longs times]
	(sequence
	(comp
	(x/partition 8 1)
	(filter (fn [x] (< (diff x) 1000)))
	(map (fn [x] [x (diff x)])))
	times))

	(benchmark
	"v5-smt-8 using primitive math iterating over the array"
	(count
	(v5-smt-8 (acompute-times TIMES-LENGTH))))
	; Benchmarking: v5-smt-8 using primitive math iterating over the array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 1.025017 sec
	; Execution time std-deviation : 14.802192 ms
	; Execution time lower quantile : 1.006743 sec ( 2.5%)
	; Execution time upper quantile : 1.041188 sec (97.5%)
	; Overhead used : 8.159543 ns
	; Processing rate: 0.976 MHz


	; "To improve the algorithm proper, I perform the entire computation in a single
	; loop statement. I also change the output slightly to show the index for matching
	; 8-fold events, start and end time, and event duration"

	; OK, let's do the same low level optimization in clojure
	(defn second-try [^longs a]
	(let [n (count a)
	max (int (- n 7))]
	(loop [i 0
	res (transient [])]
	(if (< i max)
	(let [f (aget a i)
	l (aget a (+ i 7))
	diff (unchecked-subtract l f)]
	(if (< diff 1000)
	(recur (unchecked-inc-int i) (conj! res {:index i :startval f :endval l :diff diff}))
	(recur (unchecked-inc-int i) res)))
	(persistent! res)))))

	(benchmark
	"second-try single pass primitive loop function"
	(count
	(second-try (acompute-times TIMES-LENGTH))))
	; Benchmarking: second-try single pass primitive loop function
	; Evaluation count : 30 in 6 samples of 5 calls.
	; Execution time mean : 23.821902 ms
	; Execution time std-deviation : 331.676247 µs
	; Execution time lower quantile : 23.518765 ms ( 2.5%)
	; Execution time upper quantile : 24.198100 ms (97.5%)
	; Overhead used : 8.013372 ns
	; Processing rate: 41.978 MHz


	; let's verify we get the same result
	(let [times (acompute-times TIMES-LENGTH)
	original (smt-8 times)
	functional (v5-smt-8 times)
	optimized (second-try times)
	describe-subseq (fn [[v diff]] {:startval (first v) :endval (last v) :diff diff})
	results-equal (= (map describe-subseq original)
	(map describe-subseq functional)
	(map #(dissoc % :index) optimized))]
	(println "Computation results equal:" results-equal)
	results-equal)
	; => true

	; In the blog post section "Improvements" measurements are done on the function itself
	; on a fixed 100x longer precomputed sequence
	(let [BIG-LENGTH ^:const (* 100 TIMES-LENGTH)
	times (acompute-times BIG-LENGTH)]
	(benchmark*
	"second-try on a huge precomputed array"
	(count
	(second-try times))
	BIG-LENGTH))
	; Benchmarking: second-try on a huge precomputed array
	; Evaluation count : 6 in 6 samples of 1 calls.
	; Execution time mean : 167.110500 ms
	; Execution time std-deviation : 3.097737 ms
	; Execution time lower quantile : 163.810594 ms ( 2.5%)
	; Execution time upper quantile : 171.506338 ms (97.5%)
	; Overhead used : 8.140743 ns
	; Processing rate: 598.406 MHz