Skip to content

Instantly share code, notes, and snippets.

@emanjavacas
Last active August 29, 2015 14:18
Show Gist options
  • Save emanjavacas/d8672fa88016a7a3e81a to your computer and use it in GitHub Desktop.
Save emanjavacas/d8672fa88016a7a3e81a to your computer and use it in GitHub Desktop.
(defn sliding-window
[target s n]
(loop [todo s seen [] acc []]
(let [curr (first todo)]
(cond (= curr target) (recur (rest todo) (cons curr seen) (concat acc (take n seen) (take n (rest todo))))
(empty? todo) acc
:else (recur (rest todo) (cons curr seen) acc)))))
(defn compute-space
"docs is an iterable of already preprocessed sents, context-value-fn
determine which word belongs to the context and weights it.
targets is a coll of target-words"
([docs context-fn] (compute-space docs context-fn false))
([docs context-fn targets]
(let [space (atom {})]
(doseq [doc docs
target (or targets doc)]
(when-let [contexts (context-fn target doc)]
(doseq [w contexts]
(if (get-in @space [target w])
(swap! space update-in [target w] inc)
(swap! space assoc-in [target w] 1)))))
@space)))
(defn pcompute-space
[docs step context-fn targets]
(reduce
#(deep-merge-with + %1 %2)
(pmap
(fn [chunk]
(compute-space chunk context-fn targets))
(partition-all step docs))))
(defn compute-context-map [docs context-fn target]
(->> (mapcat #(context-fn target %) docs) ; get the context words for a given target
(group-by identity)
(map (fn [[k v]] [k (count v)]))
(into {})))
(defn compute-space-2
([docs context-fn] (compute-space docs context-fn false))
([docs context-fn targets]
(into {} (map #(vector % (compute-context-map docs context-fn %)) targets))))
(defn pcompute-space-2
[docs step context-fn targets]
(into {} (pmap #(compute-space docs context-fn %) (partition-all step targets))))
(defn pcompute-space
[docs step context-fn targets]
(reduce
#(deep-merge-with + %1 %2)
(pmap
(fn [chunk]
(compute-space chunk context-fn targets))
(partition-all step docs))))
;; file reading
(defn lazy-lines [in-fname & {:keys [input] :or {input :file}}]
(letfn [(helper [rdr]
(lazy-seq (if-let [line (.readLine rdr)]
(cons line (helper rdr))
(do (.close rdr) nil))))]
(helper (io/reader in-fname))))
(defn- next-chunk [coll]
(loop [curr coll acc []]
(when-let [item (first curr)]
(cond (re-matches #"</s>" item) acc
(re-matches #"<s>" item) (recur (rest curr) acc)
:else (recur (rest curr) (conj acc item))))))
(defn- parse-vrt
[lines]
(lazy-seq
(when-let [s (seq lines)]
(let [chunk (next-chunk s)]
(cons chunk (parse-vrt (nthrest s (+ 2 (count chunk)))))))))
(defn parse-corpus
"a seq of lines, if format is word per line
input is parsed to seq of sents with the help
of sent separators (parse-vrt)."
[lines]
(letfn [(->str [ids]
(for [[token pos lemma] (map #(s/split % #"\t") ids)]
(str lemma "_" pos)))]
(map ->str (parse-vrt lines))))
; BENCHMARKING
(def lines (parse-corpus (lazy-lines "lines.test")))
(def targets '("Flughafen_NN" "analysieren_V" "Design_NN" "Einrichtung_NN" "dauern_V" "nehmen_V" "Frage_NN" "Irrtum_NN" "schwimmen_V" "Ansehen_NN" "Analyse_NN" "anfertigen_V" "Leben_NN" "Symbol_NN" "objektiv_ADJ" "Post_NN" "versichern_V" "Zeugnis_NN" "Sohn_NN" "Frau_NN" "Aufstieg_NN" "Schreibtisch_NN" "Herkunft_NN" "arbeiten_V" "Stunde_NN" "Tier_NN" "sportlich_ADJ" "moderat_ADJ" "Professor_NN" "Gewicht_NN" "bewundern_V" "deutsch_ADJ" "deutsch_NN" "hell_ADJ" "beschuldigen_V" "Eleganz_NN" "ernst_ADJ" "Petersdom_NN" "Montag_NN" "schreiben_V"))
(bench (compute-space lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;; Execution time mean : 5.026049 sec
;; Execution time std-deviation : 176.960668 ms
;; Execution time lower quantile : 4.873090 sec ( 2.5%)
;; Execution time upper quantile : 5.394192 sec (97.5%)
;; Overhead used : 3.350213 ns
;; Found 2 outliers in 60 samples (3.3333 %)
;; low-severe 1 (1.6667 %)
;; low-mild 1 (1.6667 %)
;; Variance from outliers : 22.1896 % Variance is moderately inflated by outliers
(bench (compute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;; Execution time mean : 6.950046 sec
;; Execution time std-deviation : 101.673150 ms
;; Execution time lower quantile : 6.814589 sec ( 2.5%)
;; Execution time upper quantile : 7.136314 sec (97.5%)
;; Overhead used : 3.350213 ns
(bench (pcompute-space lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;; Execution time mean : 3.190568 sec
;; Execution time std-deviation : 204.775262 ms
;; Execution time lower quantile : 2.986710 sec ( 2.5%)
;; Execution time upper quantile : 3.597555 sec (97.5%)
;; Overhead used : 3.350213 ns
;; Found 3 outliers in 60 samples (5.0000 %)
;; low-severe 2 (3.3333 %)
;; low-mild 1 (1.6667 %)
;; Variance from outliers : 48.4195 % Variance is moderately inflated by outliers
(bench (pcompute-space-2 lines (fn [t s] (sliding-window t s 5)) targets))
;; Evaluation count : 60 in 60 samples of 1 calls.
;; Execution time mean : 3.050350 sec,
;; Execution time std-deviation : 177.137265 ms
;; Execution time lower quantile : 2.881471 sec ( 2.5%)
;; Execution time upper quantile : 3.390317 sec (97.5%)
;; Overhead used : 3.350213 ns
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment