-
-
Save si14/6781aa479e696b5690b9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn viterbi
  "Decodes a tag sequence for the observations `xs` with a trigram Viterbi
  pass and returns it as a vector.

  Arguments:
    qs - callable queried as (qs [STOP-marker u v]) to score the final
         transition; also passed to `viterbi-folder` through the context.
    es - passed to `viterbi-folder` through the context
         (presumably emission scores; confirm against `viterbi-folder`).
    ks - set of candidate tags.
    xs - sequential collection of observations.

  The forward pass is delegated to `viterbi-folder`, which receives one
  [x w-tags u-tags v-tags] tuple per observation and is expected to return a
  context holding :pis-prev (scores keyed by [u v]) and :bps (a vector of
  back-pointer lookups, one per step). The best final [u v] pair is chosen
  with `argmax-max`, then the back-pointers are followed in reverse and the
  two leading start symbols are dropped.

  A score lookup that yields nil counts as 0.0 rather than reaching `*`,
  which would throw a NullPointerException."
  [qs es ks xs]
  (let [start-tags  #{"*"}
        initial-ctx {:qs qs, :es es, :pis-prev {["*" "*"] 1.0}, :bps []}
        ;; Positions 0 and 1 are preceded by start symbols, so their earlier
        ;; tag slots are restricted to "*".
        ks-adder    (fn [idx x]
                      (case idx
                        0 [x start-tags start-tags ks]
                        1 [x start-tags ks ks]
                        [x ks ks ks]))
        xs-and-ks   (into [] (map-indexed ks-adder xs))
        {:keys [bps pis-prev]} (reduce viterbi-folder initial-ctx xs-and-ks)
        ;; The final [u v] candidates mirror the padding above: with fewer
        ;; than two observations the earlier slot(s) can only be "*".
        ;; NOTE(review): assumes viterbi-folder keys :pis-prev by the
        ;; [u v] sets it was handed - confirm.
        n           (count xs-and-ks)
        us          (if (< n 2) start-tags ks)
        vs          (if (< n 1) start-tags ks)
        final-score (fn [[u v]]
                      (* (or (pis-prev [u v]) 0.0)
                         (or (qs ["STOP" u v]) 0.0)))
        [[_ y-last :as ys-tail] _] (argmax-max final-score
                                               (for [u us v vs] [u v]))]
    ;; Walk the back-pointers from the last step to the first. Each step maps
    ;; the current [y1 y2] pair to [(bt pair) y1], i.e. shifts one tag back.
    (->> (reductions (fn [[y1 _ :as t] bt] [(bt t) y1])
                     ys-tail
                     (rseq bps))
         (map first)
         (cons y-last)
         (reverse)
         (drop 2)   ; discard the two "*" start symbols
         (into []))))
(defn part2-tag
  "Reads `file-in`, splits it into blank-line-separated sentences and prepares
  each one for tagging.

  Counts are loaded with `read-grouped-data` from `rared-counts-file`; `es`
  and `qs` are derived from them via `get-es` and `get-qs`. For every
  sentence, words whose count is below 5 (or missing) are replaced with
  \"_RARE_\" and \"STOP\" is appended.

  Current state of the function (debugging leftovers are kept as they are):
    - `viterbi` is invoked for each sentence but its result is not returned;
    - the first three prepared tokens of each sentence are pretty-printed;
    - writing to `file-out` is disabled with `#_`, so `file-out` is unused.

  Returns a lazy sequence with one vector of prepared tokens per sentence."
  [file-in file-out]
  (let [{wordtags    Wordtag
         two-grams   Two-gram
         three-grams Three-gram} (read-grouped-data rared-counts-file)
        tag-counts  (count-tags wordtags)
        word-counts (count-words wordtags)
        es          (get-es wordtags tag-counts)
        qs          (get-qs three-grams two-grams)
        ;; Words seen fewer than 5 times (or never) collapse to one token.
        mask-rare   (fn [word]
                      (if (< (or (word-counts word) 0) 5)
                        "_RARE_"
                        word))
        prepare     (fn [sentence]
                      (let [tokens (clojure.string/split-lines sentence)
                            xs     (conj (mapv mask-rare tokens) "STOP")
                            ;; Author's note on this call: it was reported to
                            ;; cause a NullPointerException when enabled
                            ;; without the debug pretty-printing below.
                            tags   (viterbi qs es #{"I-GENE" "O"} xs)]
                        (clojure.pprint/pprint (take 3 xs))
                        ;; `tags` is the value meant to be returned once
                        ;; debugging is finished; for now the tokens are.
                        xs))
        sentences   (clojure.string/split (slurp file-in) #"\n\n")
        tagged-file (map prepare sentences)]
    #_(spit file-out (str (clojure.string/join "\n" tagged-file) "\n"))
    tagged-file))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment