Skip to content

Instantly share code, notes, and snippets.

@si14

si14/temp.clj Secret

Created April 1, 2013 22:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save si14/6781aa479e696b5690b9 to your computer and use it in GitHub Desktop.
Save si14/6781aa479e696b5690b9 to your computer and use it in GitHub Desktop.
(defn viterbi
  "Trigram Viterbi decoding: returns a vector with the chosen tag for each
  element of `xs`.

  Arguments:
    qs - transition table, invoked as (qs [w u v]); here it is queried with
         the STOP symbol in the first slot to score the end of the sequence.
    es - emission table; only stored in the fold context for `viterbi-folder`.
    ks - set of candidate tags.
    xs - sequence of observations.

  Each observation is paired with the candidate tag sets of the two previous
  positions and of the current one; the first two positions are restricted to
  the start symbol `*`. `viterbi-folder` (defined elsewhere) folds over these
  and is expected to return a context with `:pis-prev` (scores keyed by
  [u v]) and `:bps` (a vector of back-pointer lookups, one per position).

  A tag pair missing from `:pis-prev`, or a transition missing from `qs`, is
  scored as probability 0.0 rather than letting `*` throw a
  NullPointerException on nil."
  [qs es ks xs]
  (let [initial-ctx {:qs qs, :es es, :pis-prev {["*" "*"] 1.0}, :bps []}
        ;; Candidate tag sets for positions (i-2, i-1, i); the sentence start
        ;; is padded with the "*" symbol.
        ks-adder (fn [idx x]
                   (case idx
                     0 [x #{"*"} #{"*"} ks]
                     1 [x #{"*"} ks ks]
                     [x ks ks ks]))
        xs-and-ks (into [] (map-indexed ks-adder xs))
        {:keys [bps pis-prev]} (reduce viterbi-folder initial-ctx xs-and-ks)
        ;; Score of ending the sequence with tags (u, v). nil lookups become
        ;; 0.0 so an unseen pair/transition cannot crash the multiplication.
        final-score (fn [[u v]]
                      (* (get pis-prev [u v] 0.0)
                         (or (qs ["STOP" u v]) 0.0)))
        [[_ y-last :as ys-tail] _] (argmax-max final-score
                                               (for [u ks v ks] [u v]))]
    ;; Walk the back-pointers from the last position to the first. The state
    ;; is the tag pair [y(k-1) y(k)]; each step looks up y(k-2) and shifts the
    ;; window left by one.
    (->> (reductions (fn [[y1 _ :as t] bt] [(bt t) y1])
                     ys-tail
                     (rseq bps))
         (map first)
         (cons y-last)
         (reverse)
         ;; Drop the two "*" padding tags that precede the real sequence.
         (drop 2)
         (into []))))
;; Reads the sentences in `file-in`, replaces infrequent words with "_RARE_",
;; and runs `viterbi` over each sentence with the tag set #{"I-GENE" "O"}.
;;
;; NOTE(review): in its current (debugging) state the function does NOT write
;; anything: the `spit` to `file-out` is disabled with `#_`, so `file-out` is
;; unused, and the value returned is the lazy sequence of preprocessed word
;; vectors (`xs`), not the computed `tags`.
;;
;; NOTE(review): `for` builds a lazy sequence, so nothing inside it (the
;; `viterbi` call in `:let`, the `pprint`) runs until the caller realizes the
;; result. Any exception thrown by `viterbi` therefore surfaces at the point
;; of consumption, not inside this function - this may be why the NPE
;; mentioned below seems to depend on unrelated debug printing; confirm.
(defn part2-tag [file-in file-out]
;; The map keys are the values of the symbols Wordtag / Two-gram / Three-gram,
;; which, like `rared-counts-file`, must be defined elsewhere in the file.
(let [{wordtags Wordtag,
two-grams Two-gram,
three-grams Three-gram} (read-grouped-data rared-counts-file)
tag-counts (count-tags wordtags)
word-counts (count-words wordtags)
es (get-es wordtags tag-counts)
qs (get-qs three-grams two-grams)
;; Sentences are separated by a blank line; each line of a sentence is
;; treated as one word.
tagged-file (for [sentence (clojure.string/split (slurp file-in)
#"\n\n")
:let [
words (clojure.string/split-lines sentence)
;; Words seen fewer than 5 times (or not at all, counted as 0) are mapped
;; to the "_RARE_" pseudo-word.
rared (map #(if (< (or (word-counts %) 0) 5)
"_RARE_" %)
words)
;; "STOP" is appended as a final pseudo-word before decoding.
xs (conj (into [] rared) "STOP")
;; `tags` is evaluated for every realized sentence even though the body
;; below does not use it.
tags (viterbi qs es #{"I-GENE" "O"} xs) ;; CAUSES NPE WHEN UNCOMMENTED WITHOUT DEBUG PPRINTING
]]
;; Debug output: prints the first three words, then yields `xs`.
(do (clojure.pprint/pprint (take 3 xs))
xs)
;; tags
)]
;; Output writing is disabled by the `#_` reader discard.
#_(spit file-out (str (clojure.string/join "\n" tagged-file) "\n"))
tagged-file))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment