emanjavacas/optimizing coocurrance counts Secret

## optimizing coocurrance counts
(defn sliding-window
  [target s n]
  (loop [todo s seen [] acc []]
    (let [curr (first todo)]
      (cond (= curr target) (recur (rest todo) (cons curr seen) (concat acc (take n seen) (take n (rest todo))))
            (empty? todo) acc
            :else (recur (rest todo) (cons curr seen) acc)))))

(defn compute-space
  "docs is an iterable of already preprocessed sents, context-value-fn
  determine which word belongs to the context and weights it.
  targets is a coll of target-words"
  ([docs context-fn] (compute-space docs context-fn false))
  ([docs context-fn targets]
   (let [space (atom {})]
     (doseq [doc docs
             target (or targets doc)]
       (when-let [contexts (context-fn target doc)]
         (doseq [w contexts]
           (if (get-in @space [target w])
             (swap! space update-in [target w] inc)
             (swap! space assoc-in  [target w] 1)))))
     @space)))

(defn pcompute-space
  [docs step context-fn targets]
  (reduce
   #(deep-merge-with + %1 %2)
   (pmap
    (fn [chunk]
      (compute-space chunk context-fn targets))
    (partition-all step docs))))

(defn compute-context-map [docs context-fn target]
  (->> (mapcat #(context-fn target %) docs) ; get the context words for a given target
       (group-by identity)
       (map (fn [[k v]] [k (count v)]))
       (into {})))

(defn compute-space-2
  ([docs context-fn] (compute-space docs context-fn false))
  ([docs context-fn targets]
   (into {} (map #(vector % (compute-context-map docs context-fn %)) targets))))

(defn pcompute-space-2
  [docs step context-fn targets]
  (into {} (pmap #(compute-space docs context-fn %) (partition-all step targets))))

(defn pcompute-space
  [docs step context-fn targets]
  (reduce
   #(deep-merge-with + %1 %2)
   (pmap
    (fn [chunk]
      (compute-space chunk context-fn targets))
    (partition-all step docs))))

;; file reading
(defn lazy-lines [in-fname & {:keys [input] :or {input :file}}]
  (letfn [(helper [rdr]
            (lazy-seq (if-let [line (.readLine rdr)]
                        (cons line (helper rdr))
                        (do (.close rdr) nil))))]
    (helper (io/reader in-fname))))

(defn- next-chunk [coll]
  (loop [curr coll acc []]
    (when-let [item (first curr)]
      (cond (re-matches #"</s>" item) acc
            (re-matches #"<s>" item) (recur (rest curr) acc)
            :else (recur (rest curr) (conj acc item))))))

(defn- parse-vrt
  [lines]
  (lazy-seq
   (when-let [s (seq lines)]
     (let [chunk (next-chunk s)]
       (cons chunk (parse-vrt (nthrest s (+ 2 (count chunk)))))))))

(defn parse-corpus
  "a seq of lines, if format is word per line
  input is parsed to seq of sents with the help
  of sent separators (parse-vrt)."
  [lines]
  (letfn [(->str [ids]
            (for [[token pos lemma] (map #(s/split % #"\t") ids)]
              (str lemma "_" pos)))]
            (map ->str (parse-vrt lines))))

; BENCHMARKING
(def lines (parse-corpus (lazy-lines "lines.test")))
(def targets '("Flughafen_NN" "analysieren_V" "Design_NN" "Einrichtung_NN" "dauern_V" "nehmen_V" "Frage_NN" "Irrtum_NN" "schwimmen_V" "Ansehen_NN" "Analyse_NN" "anfertigen_V" "Leben_NN" "Symbol_NN" "objektiv_ADJ" "Post_NN" "versichern_V" "Zeugnis_NN" "Sohn_NN" "Frau_NN" "Aufstieg_NN" "Schreibtisch_NN" "Herkunft_NN" "arbeiten_V" "Stunde_NN" "Tier_NN" "sportlich_ADJ" "moderat_ADJ" "Professor_NN" "Gewicht_NN" "bewundern_V" "deutsch_ADJ" "deutsch_NN" "hell_ADJ" "beschuldigen_V" "Eleganz_NN" "ernst_ADJ" "Petersdom_NN" "Montag_NN" "schreiben_V"))

(bench (compute-space lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;;              Execution time mean : 5.026049 sec
;;     Execution time std-deviation : 176.960668 ms
;;    Execution time lower quantile : 4.873090 sec ( 2.5%)
;;    Execution time upper quantile : 5.394192 sec (97.5%)
;;                    Overhead used : 3.350213 ns

;; Found 2 outliers in 60 samples (3.3333 %)
;; 	low-severe	 1 (1.6667 %)
;; 	low-mild	 1 (1.6667 %)
;;  Variance from outliers : 22.1896 % Variance is moderately inflated by outliers

(bench (compute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;;           Execution time mean : 6.950046 sec
;;  Execution time std-deviation : 101.673150 ms
;; Execution time lower quantile : 6.814589 sec ( 2.5%)
;; Execution time upper quantile : 7.136314 sec (97.5%)
;;                 Overhead used : 3.350213 ns

(bench (pcompute-space lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;;              Execution time mean : 3.190568 sec
;;     Execution time std-deviation : 204.775262 ms
;;    Execution time lower quantile : 2.986710 sec ( 2.5%)
;;    Execution time upper quantile : 3.597555 sec (97.5%)
;;                    Overhead used : 3.350213 ns

;; Found 3 outliers in 60 samples (5.0000 %)
;; 	low-severe	 2 (3.3333 %)
;; 	low-mild	 1 (1.6667 %)
;;  Variance from outliers : 48.4195 % Variance is moderately inflated by outliers

(bench (pcompute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;;              Execution time mean : 3.050350 sec,
;;     Execution time std-deviation : 177.137265 ms
;;    Execution time lower quantile : 2.881471 sec ( 2.5%)
;;    Execution time upper quantile : 3.390317 sec (97.5%)
;;                    Overhead used : 3.350213 ns
	(defn sliding-window
	[target s n]
	(loop [todo s seen [] acc []]
	(let [curr (first todo)]
	(cond (= curr target) (recur (rest todo) (cons curr seen) (concat acc (take n seen) (take n (rest todo))))
	(empty? todo) acc
	:else (recur (rest todo) (cons curr seen) acc)))))

	(defn compute-space
	"docs is an iterable of already preprocessed sents, context-value-fn
	determine which word belongs to the context and weights it.
	targets is a coll of target-words"
	([docs context-fn] (compute-space docs context-fn false))
	([docs context-fn targets]
	(let [space (atom {})]
	(doseq [doc docs
	target (or targets doc)]
	(when-let [contexts (context-fn target doc)]
	(doseq [w contexts]
	(if (get-in @space [target w])
	(swap! space update-in [target w] inc)
	(swap! space assoc-in [target w] 1)))))
	@space)))

	(defn pcompute-space
	[docs step context-fn targets]
	(reduce
	#(deep-merge-with + %1 %2)
	(pmap
	(fn [chunk]
	(compute-space chunk context-fn targets))
	(partition-all step docs))))

	(defn compute-context-map [docs context-fn target]
	(->> (mapcat #(context-fn target %) docs) ; get the context words for a given target
	(group-by identity)
	(map (fn [[k v]] [k (count v)]))
	(into {})))

	(defn compute-space-2
	([docs context-fn] (compute-space docs context-fn false))
	([docs context-fn targets]
	(into {} (map #(vector % (compute-context-map docs context-fn %)) targets))))

	(defn pcompute-space-2
	[docs step context-fn targets]
	(into {} (pmap #(compute-space docs context-fn %) (partition-all step targets))))

	(defn pcompute-space
	[docs step context-fn targets]
	(reduce
	#(deep-merge-with + %1 %2)
	(pmap
	(fn [chunk]
	(compute-space chunk context-fn targets))
	(partition-all step docs))))

	;; file reading
	(defn lazy-lines [in-fname & {:keys [input] :or {input :file}}]
	(letfn [(helper [rdr]
	(lazy-seq (if-let [line (.readLine rdr)]
	(cons line (helper rdr))
	(do (.close rdr) nil))))]
	(helper (io/reader in-fname))))

	(defn- next-chunk [coll]
	(loop [curr coll acc []]
	(when-let [item (first curr)]
	(cond (re-matches #"</s>" item) acc
	(re-matches #"<s>" item) (recur (rest curr) acc)
	:else (recur (rest curr) (conj acc item))))))

	(defn- parse-vrt
	[lines]
	(lazy-seq
	(when-let [s (seq lines)]
	(let [chunk (next-chunk s)]
	(cons chunk (parse-vrt (nthrest s (+ 2 (count chunk)))))))))

	(defn parse-corpus
	"a seq of lines, if format is word per line
	input is parsed to seq of sents with the help
	of sent separators (parse-vrt)."
	[lines]
	(letfn [(->str [ids]
	(for [[token pos lemma] (map #(s/split % #"\t") ids)]
	(str lemma "_" pos)))]
	(map ->str (parse-vrt lines))))

	; BENCHMARKING
	(def lines (parse-corpus (lazy-lines "lines.test")))
	(def targets '("Flughafen_NN" "analysieren_V" "Design_NN" "Einrichtung_NN" "dauern_V" "nehmen_V" "Frage_NN" "Irrtum_NN" "schwimmen_V" "Ansehen_NN" "Analyse_NN" "anfertigen_V" "Leben_NN" "Symbol_NN" "objektiv_ADJ" "Post_NN" "versichern_V" "Zeugnis_NN" "Sohn_NN" "Frau_NN" "Aufstieg_NN" "Schreibtisch_NN" "Herkunft_NN" "arbeiten_V" "Stunde_NN" "Tier_NN" "sportlich_ADJ" "moderat_ADJ" "Professor_NN" "Gewicht_NN" "bewundern_V" "deutsch_ADJ" "deutsch_NN" "hell_ADJ" "beschuldigen_V" "Eleganz_NN" "ernst_ADJ" "Petersdom_NN" "Montag_NN" "schreiben_V"))

	(bench (compute-space lines (fn [t s] (sliding-window t s 5)) targets))
	;; Evaluation count : 60 in 60 samples of 1 calls.
	;; Execution time mean : 5.026049 sec
	;; Execution time std-deviation : 176.960668 ms
	;; Execution time lower quantile : 4.873090 sec ( 2.5%)
	;; Execution time upper quantile : 5.394192 sec (97.5%)
	;; Overhead used : 3.350213 ns

	;; Found 2 outliers in 60 samples (3.3333 %)
	;; low-severe 1 (1.6667 %)
	;; low-mild 1 (1.6667 %)
	;; Variance from outliers : 22.1896 % Variance is moderately inflated by outliers

	(bench (compute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
	;; Evaluation count : 60 in 60 samples of 1 calls.
	;; Execution time mean : 6.950046 sec
	;; Execution time std-deviation : 101.673150 ms
	;; Execution time lower quantile : 6.814589 sec ( 2.5%)
	;; Execution time upper quantile : 7.136314 sec (97.5%)
	;; Overhead used : 3.350213 ns

	(bench (pcompute-space lines (fn [t s] (sliding-window t s 5)) targets))
	;; Evaluation count : 60 in 60 samples of 1 calls.
	;; Execution time mean : 3.190568 sec
	;; Execution time std-deviation : 204.775262 ms
	;; Execution time lower quantile : 2.986710 sec ( 2.5%)
	;; Execution time upper quantile : 3.597555 sec (97.5%)
	;; Overhead used : 3.350213 ns

	;; Found 3 outliers in 60 samples (5.0000 %)
	;; low-severe 2 (3.3333 %)
	;; low-mild 1 (1.6667 %)
	;; Variance from outliers : 48.4195 % Variance is moderately inflated by outliers

	(bench (pcompute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
	;; Evaluation count : 60 in 60 samples of 1 calls.
	;; Execution time mean : 3.050350 sec,
	;; Execution time std-deviation : 177.137265 ms
	;; Execution time lower quantile : 2.881471 sec ( 2.5%)
	;; Execution time upper quantile : 3.390317 sec (97.5%)
	;; Overhead used : 3.350213 ns