joinr/correlation.clj

## correlation.clj

;;These mods pulled runtime down about 37x without going into primitive arrays,
;;and retaining a good bit of the original code.

;;Intermediate mods that got us down about 12x, keeping fairly clost to the original.
#_(defn split-adjacent [f xs]
  (let [prev (atom nil)
        res  (reduce (fn [acc x]
                       (if (f x)
                         (reset! prev x)
                         (reduced x))) nil xs)]
    [@prev res]))

#_(defn interpolate-point
  "Given a vector V1 of [X1 Y1] pairs and a vector of X2 values
  this returns an [X2 Y2} values, Where the Y2's are interpolated linearly"
  [x2 v1]
  (let [[preceeding-point following-point]
        (split-adjacent #(<= ^double (nth % 0) x2) v1)]
    (if (or (nil? preceeding-point)
            (nil? following-point))
      nil
      (/ (+ (nth preceeding-point 1)
            (nth following-point  1))
         2.0))))

#_(defn interpolate-points
  "Given a vector of X2 values and a vector V1 of [X1 Y1] pairs
  this returns an [X2 Y2] values, Where the Y2's are interpolated linearly"
  [x2 v1]
  (->> x2
       (map #(vector % (interpolate-point % v1)))
       #_(filter #(some? (second %)))))

;;New mods that get us down ~5x over the intermediate improvement.

;;More efficient algorithm for lerping.  We now compute
;;a sequence of lines, and later step through that sequence
;;with our input x values, projecting onto the lines to
;;get a lerp'd result.  This is not identical to the simple
;;averaging that was happening before, but you can do that as
;;well.  This is actually more general in that you get a piecewise
;;linear approximation of the underlying function, which
;;forms the lerp basis.  Algorthimically, we have a much more
;;efficient implementation, since it's only O(N) in the size of the
;;input, where the original implementation was doing intersections for
;;every point in order to find the applicable range.  This way,
;;we just scan through the input in one batch, and lerp lazily as we go.

(defn linear-ranges [points]
  (for [[l r]  (partition 2 1 points)]
    (let [[x1 y1] l
          [x2 y2] r]
      {:l x1
       :r x2
       :x0    x1
       :y0    y1
       :m    (double (/ (- y2 y1) (- x2 x1)))})))

;;project x onto f(x) = m(x - x0) + y0
(defn lerp [stats x]
  (let [^double m  (stats :m)
        ^double x0 (stats :x0)
        ^double y0 (stats :y0)]
    (+ (* (- ^double x x0) m) y0)))

(defn lerp-points [xs vs]
  (when-let [x (first xs)]
    (when-let [v (first vs)]
      (lazy-seq
       (if (<= x (v :r))
         (cons [x (lerp v x)]
               (lerp-points (rest xs) vs))
         (lerp-points xs (rest vs)))))))

(defn interpolate-points
  "Given a vector of X2 values and a vector V1 of [X1 Y1] pairs
  this returns an [X2 Y2] values, Where the Y2's are interpolated linearly"
  [x2 v1]
  (->> v1
       sort
       linear-ranges
       (lerp-points (sort x2))
       vec))

;;unchanged
(defn interpolate-overlap
  "Take a 2 point vector PV1 PV2. Crop PV1 to the overlapping segment.
  Interpolate PV2 points to PV1 positions
  Return the cropped PV1 and interpolated PV2 (they will be the same length)"
  [pv1
   pv2]
  (let [first-pv2-position (-> pv2
                              first
                              first)
        last-pv2-position (-> pv2
                             last
                             first)
        trimmed-pv1 (->> pv1
                        ;; drop all preceeding points with no overlap
                        (drop-while #(->> %
                                          first
                                          (>= first-pv2-position)))
                        ;; only take remaining points with overlap
                        (take-while #(->> %
                                          first
                                          (> last-pv2-position))))
        interpolation-positions (map first trimmed-pv1)
        interpolated-pv2 (interpolate-points interpolation-positions
                                             pv2)]
        [trimmed-pv1 interpolated-pv2]))

;;minor modification to avoid invoking count in case we have an O(n) count.
;;not a big deal, but shaved a few ms.
(defn mean [coll]
  (let [[sum n]
          (reduce (fn [[sum n] x]
                    [(+ sum x) (unchecked-inc n)]) [0 0] coll)]
    (/ (double sum) n)))

#_(defn mean
    [coll]
    (assert (not-empty coll) "calculating the mean of an empty vector")
    (/ (reduce + coll) (count coll)))

;;substantial modification.  We now compute most of what we
;;need in 3 passes (if you count the means) vs. 5.
;;Also double type hints help avoid generic numerics.
;;Saves several hundred ms.
(defn vector-correlation-coefficient [xs ys]
  (let [xs    (vec xs)
        ys    (vec ys)
        x-mean (mean xs)
        y-mean (mean ys)
        {:keys [sdxy varx vary]}
           (reduce (fn blee [acc idx]
                     (let [x   (xs idx)
                           y   (ys idx)
                           ^double sdxy  (acc :sdxy)
                           ^double varx  (acc :varx)
                           ^double vary  (acc :vary)
                            xd (- ^double x  x-mean)
                            yd (- ^double y  y-mean)]
                       {:sdxy  (+  sdxy ^double (* xd yd))
                        :varx  (+  varx ^double (Math/pow xd 2.0))
                        :vary  (+  vary ^double (Math/pow yd 2.0))})) {:sdxy 0.0 :varx 0.0 :vary 0.0}
                   (range (count xs)))]
    (/ ^double sdxy
       ^double (Math/sqrt ^double (* ^double varx ^double vary)))))

#_(defn- vector-correlation-coefficient
  "Get the correlation-coefficient of two vectors.
  The vectors obviously should be the same length"
  [x
   y]
  (let [x-mean (mean x)
        y-mean (mean y)]
    (/ (reduce +
               (map (fn [xi yi]
                      (* (- xi
                            x-mean)
                         (- yi
                            y-mean)))
                    x
                    y))
       (Math/sqrt (* (reduce +
                             (map (fn [xi]
                                    (* (Math/pow (- xi
                                                    x-mean)
                                                 2.0)))
                                  x))
                     (reduce +
                             (map (fn [yi]
                                    (Math/pow (- yi
                                                 y-mean)
                                              2.0))
                                  y)))))))

(defn points-correlation-coefficient
  "Take TWO element count vectors of the form
  [[position value]
   [position value]
   [position value]
   ...]
  and calculate their correlation.
  Takes an options V2-SHIFT values to adjust v2 postions"
  ([v1
    v2
    v2-shift]
   (let [shifted-v2 (map #(vector (-> %
                                      first
                                      (+ v2-shift))
                                  (-> %
                                      second))
                         v2)]
     #_(print " "
            v2-shift)
     (points-correlation-coefficient v1
                                     shifted-v2)))
  ([v1
    v2]
   (let [[trimmed-v1 trimmed-v2] (interpolate-overlap v1 v2)]
     (vector-correlation-coefficient (map second trimmed-v1)
                                     (map second trimmed-v2)))))

(defn points-cross-correlation
  "Take TWO element count vectors of the form
  [[position value]
   [position value]
   [position value]
   ...]"
  [v1
   v2
   step-size]
  (let [shifts (-> (range (+ (- (first (last v2)))
                             (first (first v1)))
                          (- (first (last v1))
                             (first (first v2)))
                          step-size)
                   rest
                   butlast)
        correlations (map #(points-correlation-coefficient v1
                                                           v2
                                                           %)
                          shifts)]
    (zipmap shifts
            correlations)))

	;;These mods pulled runtime down about 37x without going into primitive arrays,
	;;and retaining a good bit of the original code.

	;;Intermediate mods that got us down about 12x, keeping fairly clost to the original.
	#_(defn split-adjacent [f xs]
	(let [prev (atom nil)
	res (reduce (fn [acc x]
	(if (f x)
	(reset! prev x)
	(reduced x))) nil xs)]
	[@prev res]))

	#_(defn interpolate-point
	"Given a vector V1 of [X1 Y1] pairs and a vector of X2 values
	this returns an [X2 Y2} values, Where the Y2's are interpolated linearly"
	[x2 v1]
	(let [[preceeding-point following-point]
	(split-adjacent #(<= ^double (nth % 0) x2) v1)]
	(if (or (nil? preceeding-point)
	(nil? following-point))
	nil
	(/ (+ (nth preceeding-point 1)
	(nth following-point 1))
	2.0))))

	#_(defn interpolate-points
	"Given a vector of X2 values and a vector V1 of [X1 Y1] pairs
	this returns an [X2 Y2] values, Where the Y2's are interpolated linearly"
	[x2 v1]
	(->> x2
	(map #(vector % (interpolate-point % v1)))
	#_(filter #(some? (second %)))))

	;;New mods that get us down ~5x over the intermediate improvement.

	;;More efficient algorithm for lerping. We now compute
	;;a sequence of lines, and later step through that sequence
	;;with our input x values, projecting onto the lines to
	;;get a lerp'd result. This is not identical to the simple
	;;averaging that was happening before, but you can do that as
	;;well. This is actually more general in that you get a piecewise
	;;linear approximation of the underlying function, which
	;;forms the lerp basis. Algorthimically, we have a much more
	;;efficient implementation, since it's only O(N) in the size of the
	;;input, where the original implementation was doing intersections for
	;;every point in order to find the applicable range. This way,
	;;we just scan through the input in one batch, and lerp lazily as we go.

	(defn linear-ranges [points]
	(for [[l r] (partition 2 1 points)]
	(let [[x1 y1] l
	[x2 y2] r]
	{:l x1
	:r x2
	:x0 x1
	:y0 y1
	:m (double (/ (- y2 y1) (- x2 x1)))})))

	;;project x onto f(x) = m(x - x0) + y0
	(defn lerp [stats x]
	(let [^double m (stats :m)
	^double x0 (stats :x0)
	^double y0 (stats :y0)]
	(+ (* (- ^double x x0) m) y0)))

	(defn lerp-points [xs vs]
	(when-let [x (first xs)]
	(when-let [v (first vs)]
	(lazy-seq
	(if (<= x (v :r))
	(cons [x (lerp v x)]
	(lerp-points (rest xs) vs))
	(lerp-points xs (rest vs)))))))

	(defn interpolate-points
	"Given a vector of X2 values and a vector V1 of [X1 Y1] pairs
	this returns an [X2 Y2] values, Where the Y2's are interpolated linearly"
	[x2 v1]
	(->> v1
	sort
	linear-ranges
	(lerp-points (sort x2))
	vec))

	;;unchanged
	(defn interpolate-overlap
	"Take a 2 point vector PV1 PV2. Crop PV1 to the overlapping segment.
	Interpolate PV2 points to PV1 positions
	Return the cropped PV1 and interpolated PV2 (they will be the same length)"
	[pv1
	pv2]
	(let [first-pv2-position (-> pv2
	first
	first)
	last-pv2-position (-> pv2
	last
	first)
	trimmed-pv1 (->> pv1
	;; drop all preceeding points with no overlap
	(drop-while #(->> %
	first
	(>= first-pv2-position)))
	;; only take remaining points with overlap
	(take-while #(->> %
	first
	(> last-pv2-position))))
	interpolation-positions (map first trimmed-pv1)
	interpolated-pv2 (interpolate-points interpolation-positions
	pv2)]
	[trimmed-pv1 interpolated-pv2]))

	;;minor modification to avoid invoking count in case we have an O(n) count.
	;;not a big deal, but shaved a few ms.
	(defn mean [coll]
	(let [[sum n]
	(reduce (fn [[sum n] x]
	[(+ sum x) (unchecked-inc n)]) [0 0] coll)]
	(/ (double sum) n)))

	#_(defn mean
	[coll]
	(assert (not-empty coll) "calculating the mean of an empty vector")
	(/ (reduce + coll) (count coll)))

	;;substantial modification. We now compute most of what we
	;;need in 3 passes (if you count the means) vs. 5.
	;;Also double type hints help avoid generic numerics.
	;;Saves several hundred ms.
	(defn vector-correlation-coefficient [xs ys]
	(let [xs (vec xs)
	ys (vec ys)
	x-mean (mean xs)
	y-mean (mean ys)
	{:keys [sdxy varx vary]}
	(reduce (fn blee [acc idx]
	(let [x (xs idx)
	y (ys idx)
	^double sdxy (acc :sdxy)
	^double varx (acc :varx)
	^double vary (acc :vary)
	xd (- ^double x x-mean)
	yd (- ^double y y-mean)]
	{:sdxy (+ sdxy ^double (* xd yd))
	:varx (+ varx ^double (Math/pow xd 2.0))
	:vary (+ vary ^double (Math/pow yd 2.0))})) {:sdxy 0.0 :varx 0.0 :vary 0.0}
	(range (count xs)))]
	(/ ^double sdxy
	^double (Math/sqrt ^double (* ^double varx ^double vary)))))

	#_(defn- vector-correlation-coefficient
	"Get the correlation-coefficient of two vectors.
	The vectors obviously should be the same length"
	[x
	y]
	(let [x-mean (mean x)
	y-mean (mean y)]
	(/ (reduce +
	(map (fn [xi yi]
	(* (- xi
	x-mean)
	(- yi
	y-mean)))
	x
	y))
	(Math/sqrt (* (reduce +
	(map (fn [xi]
	(* (Math/pow (- xi
	x-mean)
	2.0)))
	x))
	(reduce +
	(map (fn [yi]
	(Math/pow (- yi
	y-mean)
	2.0))
	y)))))))

	(defn points-correlation-coefficient
	"Take TWO element count vectors of the form
	[[position value]
	[position value]
	[position value]
	...]
	and calculate their correlation.
	Takes an options V2-SHIFT values to adjust v2 postions"
	([v1
	v2
	v2-shift]
	(let [shifted-v2 (map #(vector (-> %
	first
	(+ v2-shift))
	(-> %
	second))
	v2)]
	#_(print " "
	v2-shift)
	(points-correlation-coefficient v1
	shifted-v2)))
	([v1
	v2]
	(let [[trimmed-v1 trimmed-v2] (interpolate-overlap v1 v2)]
	(vector-correlation-coefficient (map second trimmed-v1)
	(map second trimmed-v2)))))

	(defn points-cross-correlation
	"Take TWO element count vectors of the form
	[[position value]
	[position value]
	[position value]
	...]"
	[v1
	v2
	step-size]
	(let [shifts (-> (range (+ (- (first (last v2)))
	(first (first v1)))
	(- (first (last v1))
	(first (first v2)))
	step-size)
	rest
	butlast)
	correlations (map #(points-correlation-coefficient v1
	v2
	%)
	shifts)]
	(zipmap shifts
	correlations)))