-
-
Save emanjavacas/d8672fa88016a7a3e81a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn sliding-window | |
[target s n] | |
(loop [todo s seen [] acc []] | |
(let [curr (first todo)] | |
(cond (= curr target) (recur (rest todo) (cons curr seen) (concat acc (take n seen) (take n (rest todo)))) | |
(empty? todo) acc | |
:else (recur (rest todo) (cons curr seen) acc))))) | |
(defn compute-space | |
"docs is an iterable of already preprocessed sents, context-value-fn | |
determine which word belongs to the context and weights it. | |
targets is a coll of target-words" | |
([docs context-fn] (compute-space docs context-fn false)) | |
([docs context-fn targets] | |
(let [space (atom {})] | |
(doseq [doc docs | |
target (or targets doc)] | |
(when-let [contexts (context-fn target doc)] | |
(doseq [w contexts] | |
(if (get-in @space [target w]) | |
(swap! space update-in [target w] inc) | |
(swap! space assoc-in [target w] 1))))) | |
@space))) | |
(defn pcompute-space | |
[docs step context-fn targets] | |
(reduce | |
#(deep-merge-with + %1 %2) | |
(pmap | |
(fn [chunk] | |
(compute-space chunk context-fn targets)) | |
(partition-all step docs)))) | |
(defn compute-context-map [docs context-fn target] | |
(->> (mapcat #(context-fn target %) docs) ; get the context words for a given target | |
(group-by identity) | |
(map (fn [[k v]] [k (count v)])) | |
(into {}))) | |
(defn compute-space-2 | |
([docs context-fn] (compute-space docs context-fn false)) | |
([docs context-fn targets] | |
(into {} (map #(vector % (compute-context-map docs context-fn %)) targets)))) | |
(defn pcompute-space-2 | |
[docs step context-fn targets] | |
(into {} (pmap #(compute-space docs context-fn %) (partition-all step targets)))) | |
(defn pcompute-space | |
[docs step context-fn targets] | |
(reduce | |
#(deep-merge-with + %1 %2) | |
(pmap | |
(fn [chunk] | |
(compute-space chunk context-fn targets)) | |
(partition-all step docs)))) | |
;; file reading | |
(defn lazy-lines [in-fname & {:keys [input] :or {input :file}}] | |
(letfn [(helper [rdr] | |
(lazy-seq (if-let [line (.readLine rdr)] | |
(cons line (helper rdr)) | |
(do (.close rdr) nil))))] | |
(helper (io/reader in-fname)))) | |
(defn- next-chunk [coll] | |
(loop [curr coll acc []] | |
(when-let [item (first curr)] | |
(cond (re-matches #"</s>" item) acc | |
(re-matches #"<s>" item) (recur (rest curr) acc) | |
:else (recur (rest curr) (conj acc item)))))) | |
(defn- parse-vrt | |
[lines] | |
(lazy-seq | |
(when-let [s (seq lines)] | |
(let [chunk (next-chunk s)] | |
(cons chunk (parse-vrt (nthrest s (+ 2 (count chunk))))))))) | |
(defn parse-corpus | |
"a seq of lines, if format is word per line | |
input is parsed to seq of sents with the help | |
of sent separators (parse-vrt)." | |
[lines] | |
(letfn [(->str [ids] | |
(for [[token pos lemma] (map #(s/split % #"\t") ids)] | |
(str lemma "_" pos)))] | |
(map ->str (parse-vrt lines)))) | |
; BENCHMARKING | |
(def lines (parse-corpus (lazy-lines "lines.test"))) | |
(def targets '("Flughafen_NN" "analysieren_V" "Design_NN" "Einrichtung_NN" "dauern_V" "nehmen_V" "Frage_NN" "Irrtum_NN" "schwimmen_V" "Ansehen_NN" "Analyse_NN" "anfertigen_V" "Leben_NN" "Symbol_NN" "objektiv_ADJ" "Post_NN" "versichern_V" "Zeugnis_NN" "Sohn_NN" "Frau_NN" "Aufstieg_NN" "Schreibtisch_NN" "Herkunft_NN" "arbeiten_V" "Stunde_NN" "Tier_NN" "sportlich_ADJ" "moderat_ADJ" "Professor_NN" "Gewicht_NN" "bewundern_V" "deutsch_ADJ" "deutsch_NN" "hell_ADJ" "beschuldigen_V" "Eleganz_NN" "ernst_ADJ" "Petersdom_NN" "Montag_NN" "schreiben_V")) | |
(bench (compute-space lines (fn [t s] (sliding-window t s 5)) targets)) | |
;; Evaluation count : 60 in 60 samples of 1 calls. | |
;; Execution time mean : 5.026049 sec | |
;; Execution time std-deviation : 176.960668 ms | |
;; Execution time lower quantile : 4.873090 sec ( 2.5%) | |
;; Execution time upper quantile : 5.394192 sec (97.5%) | |
;; Overhead used : 3.350213 ns | |
;; Found 2 outliers in 60 samples (3.3333 %) | |
;; low-severe 1 (1.6667 %) | |
;; low-mild 1 (1.6667 %) | |
;; Variance from outliers : 22.1896 % Variance is moderately inflated by outliers | |
(bench (compute-space-2 lines (fn [t s] (sliding-window t s 5)) targets)) | |
;; Evaluation count : 60 in 60 samples of 1 calls. | |
;; Execution time mean : 6.950046 sec | |
;; Execution time std-deviation : 101.673150 ms | |
;; Execution time lower quantile : 6.814589 sec ( 2.5%) | |
;; Execution time upper quantile : 7.136314 sec (97.5%) | |
;; Overhead used : 3.350213 ns | |
(bench (pcompute-space lines (fn [t s] (sliding-window t s 5)) targets)) | |
;; Evaluation count : 60 in 60 samples of 1 calls. | |
;; Execution time mean : 3.190568 sec | |
;; Execution time std-deviation : 204.775262 ms | |
;; Execution time lower quantile : 2.986710 sec ( 2.5%) | |
;; Execution time upper quantile : 3.597555 sec (97.5%) | |
;; Overhead used : 3.350213 ns | |
;; Found 3 outliers in 60 samples (5.0000 %) | |
;; low-severe 2 (3.3333 %) | |
;; low-mild 1 (1.6667 %) | |
;; Variance from outliers : 48.4195 % Variance is moderately inflated by outliers | |
(bench (pcompute-space-2 lines (fn [t s] (sliding-window t s 5)) targets)) | |
;; Evaluation count : 60 in 60 samples of 1 calls. | |
;; Execution time mean : 3.050350 sec, | |
;; Execution time std-deviation : 177.137265 ms | |
;; Execution time lower quantile : 2.881471 sec ( 2.5%) | |
;; Execution time upper quantile : 3.390317 sec (97.5%) | |
;; Overhead used : 3.350213 ns |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment