@Folcon
Created January 17, 2012 12:24
A Clojure Implementation of the Viterbi Algorithm

Below is a sample of the bracketed, part-of-speech-tagged Penn Treebank (WSJ) text that the tagger reads, followed by the Clojure implementation itself.
======================================
In/IN
[ an/DT Oct./NNP 19/CD review/NN ]
of/IN ``/``
[ The/DT Misanthrope/NN ]
''/'' at/IN
[ Chicago/NNP 's/POS Goodman/NNP Theatre/NNP ]
(/(
[ ``/`` Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
,/, ''/''
[ Leisure/NN ]
&/CC
[ Arts/NNS ]
)/) ,/,
[ the/DT role/NN ]
of/IN
[ Celimene/NNP ]
,/, played/VBN by/IN
[ Kim/NNP Cattrall/NNP ]
,/, was/VBD mistakenly/RB attributed/VBN to/TO
[ Christina/NNP Haag/NNP ]
./.
[ Ms./NNP Haag/NNP ]
plays/VBZ
[ Elianti/NNP ]
./.
======================================
(/( See/VB :/: ``/``
[ Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
''/'' --/:
[ WSJ/NNP Oct./NNP 19/CD ]
,/,
[ 1989/CD ]
)/)
======================================
======================================
[ Rolls-Royce/NNP Motor/NNP Cars/NNPS Inc./NNP ]
said/VBD
[ it/PRP ]
expects/VBZ
[ its/PRP$ U.S./NNP sales/NNS ]
to/TO remain/VB
[ steady/JJ ]
at/IN about/IN
[ 1,200/CD cars/NNS ]
in/IN
[ 1990/CD ]
./.
[ The/DT luxury/NN auto/NN maker/NN last/JJ year/NN ]
sold/VBD
[ 1,214/CD cars/NNS ]
in/IN
[ the/DT U.S./NNP Howard/NNP Mosher/NNP ]
,/,
[ president/NN ]
and/CC
[ chief/JJ executive/NN officer/NN ]
,/, said/VBD
[ he/PRP ]
anticipates/VBZ
[ growth/NN ]
for/IN
[ the/DT luxury/NN auto/NN maker/NN ]
in/IN
[ Britain/NNP ]
and/CC
[ Europe/NNP ]
,/, and/CC in/IN
[ Far/JJ Eastern/JJ markets/NNS ]
./.
======================================
[ BELL/NNP INDUSTRIES/NNP Inc./NNP ]
increased/VBD
[ its/PRP$ quarterly/NN ]
to/TO
[ 10/CD cents/NNS ]
from/IN
[ seven/CD cents/NNS ]
[ a/DT share/NN ]
./.
[ The/DT new/JJ rate/NN ]
will/MD be/VB
[ payable/JJ Feb./NNP 15/CD ]
./.
[ A/DT record/NN date/NN has/VBZ n't/RB ]
been/VBN set/VBN ./.
[ Bell/NNP ]
,/, based/VBN in/IN
[ Los/NNP Angeles/NNP ]
,/, makes/VBZ and/CC distributes/VBZ
[ electronic/JJ ]
,/,
[ computer/NN ]
and/CC
[ building/NN products/NNS ]
./.
======================================
[ Investors/NNS ]
are/VBP appealing/VBG to/TO
[ the/DT Securities/NNPS ]
and/CC
[ Exchange/NNP Commission/NNP ]
not/RB to/TO limit/VB
[ their/PRP$ access/NN ]
to/TO
[ information/NN ]
about/IN
[ stock/NN purchases/NNS ]
and/CC
[ sales/NNS ]
by/IN
[ corporate/JJ insiders/NNS ]
./.
======================================
[ A/DT SEC/NNP proposal/NN ]
to/TO ease/VB reporting/NN
[ requirements/NNS ]
for/IN
[ some/DT company/NN executives/NNS ]
would/MD undermine/VB
[ the/DT usefulness/NN ]
of/IN
[ information/NN ]
on/IN
[ insider/NN trades/NNS ]
as/IN
[ a/DT stock-picking/JJ tool/NN ]
,/,
[ individual/JJ investors/NNS ]
and/CC
[ professional/JJ money/NN managers/NNS ]
contend/VBP ./.
(ns tagger.core
  "Running Viterbi on some text"
  (:require [clojure.string :as str]
            [clojure.contrib.generic.functor :as functor]
            [clojure.contrib.math :as math]
            [clojure.set :as set]
            [clojure.data :as data]
            [clojure.data.finger-tree :as ft]))
(def ^:dynamic *epsilon* 0.01)
(defn applyAll
  "Applies each function in fs to the single argument x."
  [fs x]
  (map #(% x) fs))

(defn split-evenly
  "Partitions coll into chunks of (quot (count coll) 10) elements, i.e. roughly ten equal pieces."
  [coll]
  (partition (quot (count coll) 10) coll))

(defn nil?-zero
  "Applies f to args, treating a nil result as 0."
  [f & args]
  (let [val (apply f args)]
    (if (nil? val)
      0
      val)))
;; Things to watch out for:
;; There are 1942 sentences
;; There are 44 escaped slashes ("\/") inside words
;; There are 5 tokens whose tag contains a |, i.e. two alternative tags joined into one
;; Counts needed are:
;; word counts W
;; tag counts T
;; word-tag counts W-T
;; previous tag to current tag counts T(i+1)-Ti
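;; For example, from a toy fragment "The/DT dog/NN barks/VBZ ./." we would
;; want roughly (illustrative only, not from the original data):
;;   word counts W:            {"The" 1, "dog" 1, "barks" 1, "." 1}
;;   tag counts T:             {"DT" 1, "NN" 1, "VBZ" 1, "." 1}
;;   word-tag counts W-T:      {"DT" {"The" 1}, "NN" {"dog" 1}, ...}
;;   previous-tag/tag counts:  {"DT" {"NN" 1}, "NN" {"VBZ" 1}, ...}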
;; Problem set
(def str-to-tags (slurp "resources/treebank3_sect2.txt"))
;; A sample set to play with
;; (def str-to-tags (slurp "resources/sample.txt"))
;; A sample set containing the tags with |
;; (def str-to-tags (slurp "resources/sample2.txt"))
;; A sample set containing words with \/ (escaped /'s)
;; (def str-to-tags (slurp "resources/sample3.txt"))
;; A small sample set
;; (def str-to-tags (slurp "resources/sample4.txt"))
;; A quarter of the problem set
;; (def str-to-tags (slurp "resources/sample5.txt"))
;; 10 sentences
;; (def str-to-tags (slurp "resources/sample6.txt"))
(defn str->tags [string]
  (filter #(not (empty? %))
          (str/split string #"[\s]")))

(defn tag->W-T
  "Converts a tag such as In/IN into a W-T such as [In IN]"
  [tag]
  (str/split tag #"/"))
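;; A quick sketch (not part of the original gist) of the tokenising helpers on
;; a fragment of the corpus; bracket tokens such as "[" produce single-element
;; splits and are dropped later by the (= (count %) 2) filter in cleaned-tag-str.
(comment
  (str->tags "In/IN\n[ an/DT Oct./NNP 19/CD review/NN ]")
  ;; => ("In/IN" "[" "an/DT" "Oct./NNP" "19/CD" "review/NN" "]")
  (tag->W-T "In/IN")
  ;; => ["In" "IN"]
  )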
(defn sentence->tags [sentence]
  (map second sentence))

(defn strip-tags [tag]
  (first tag))

(defn this-and-that [xs]
  (map-indexed (fn [i x]
                 [x (concat (take i xs)
                            (drop (inc i) xs))])
               xs))
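;; Illustration (not part of the original gist): this-and-that produces the
;; leave-one-out splits that drive the 10-fold cross-validation below.
(comment
  (this-and-that [:a :b :c])
  ;; => ([:a (:b :c)] [:b (:a :c)] [:c (:a :b)])
  )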
(def cleaned-tag-str (filter #(= (count %) 2) (map tag->W-T (str->tags str-to-tags))))
(defn split-sentences
  "Splits the tag list into sentences"
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (ft/conjr (pop %1) (ft/conjr (peek %1) (first %2)))
             (ft/conjr %1 %2))
          (ft/double-list)
          (map #(apply ft/double-list %) (partition-by #(= "." (second %)) tag-str))))

(defn split-sentences-start-end
  "Splits the tag list into sentences with added START and END tags"
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (conj (vec (drop-last %1)) (conj (vec (conj (vec (conj (last %1) ["START" "START"]))
                                                         (first %2))) ["END" "END"]))
             (conj (vec %1) %2))
          []
          (partition-by #(= "." (second %)) tag-str)))
(def sentences (split-sentences cleaned-tag-str))
(def testing-and-training-sentences
  "A list containing 10 pairs of testing sentences and training sentences"
  (map (fn [[fst rst]] (ft/double-list fst (apply concat rst)))
       (this-and-that (split-evenly sentences))))

(defn add-start-end [sentence-list]
  (map #(ft/consl (ft/conjr % ["END" "END"]) ["START" "START"]) sentence-list))
(def testing-and-training-sentences-start-end
  (map #(map add-start-end %) testing-and-training-sentences))

(def training-tag-list-start-end
  (map (comp #(map sentence->tags %) second) testing-and-training-sentences-start-end))

(def testing-and-training-tag-list-start-end
  (map (fn [sample] (map #(map sentence->tags %) sample)) testing-and-training-sentences-start-end))
(defn insert
  "Inserts a key k into a map m with a count of 1 if it does not exist, or increments the count if it does"
  [m k]
  (let [val (m k)]
    (assoc m k (inc (if (nil? val) 0 val)))))

(defn nested-insert
  "Inserts a [word tag] pair into a nested map m of {tag {word count}}, creating or incrementing the count"
  [m [word tag]]
  (let [val (get-in m [tag word])]
    (assoc-in m [tag word] (inc (if (nil? val) 0 val)))))

(defn word-count
  "Word counts in the format of {word count}"
  [tagged-str]
  (reduce #(insert %1 (first %2)) {} tagged-str))

(defn tag-count
  "Tag counts in the format of {tag count}"
  [tagged-str]
  (reduce #(insert %1 (second %2)) {} tagged-str))

(defn nested-tag-word-count
  "Nested counts in the format of {tag {word count}}"
  [tagged-str]
  (reduce #(nested-insert %1 %2) {} tagged-str))
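;; A small sketch (toy data, not from the original gist) of the counting
;; helpers on a hand-made sequence of [word tag] pairs:
(comment
  (word-count [["The" "DT"] ["dog" "NN"] ["the" "DT"] ["dog" "NN"]])
  ;; => {"The" 1, "dog" 2, "the" 1}
  (tag-count [["The" "DT"] ["dog" "NN"] ["the" "DT"] ["dog" "NN"]])
  ;; => {"DT" 2, "NN" 2}
  (nested-tag-word-count [["The" "DT"] ["dog" "NN"] ["the" "DT"] ["dog" "NN"]])
  ;; => {"DT" {"The" 1, "the" 1}, "NN" {"dog" 2}}
  )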
(def tag-count-training-list
  (map #(tag-count (apply concat (second %))) testing-and-training-sentences))

(def word-count-training-list
  (map #(word-count (apply concat (second %))) testing-and-training-sentences))

(def nested-tag-word-count-training-list
  (map #(nested-tag-word-count (apply concat (second %))) testing-and-training-sentences))
(defn out-of-step-list
  "Creates a list of [tag previous-tag] pairs by pairing each tag with the tag before it"
  [tag-list]
  (map vector (rest tag-list) tag-list))
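;; Illustration (not part of the original gist): pairing each tag with its
;; predecessor is exactly the shape the transition counts need.
(comment
  (out-of-step-list ["START" "DT" "NN" "." "END"])
  ;; => (["DT" "START"] ["NN" "DT"] ["." "NN"] ["END" "."])
  )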
(def training-previous-tag-tag-list-start-end
  (map #(map out-of-step-list %) training-tag-list-start-end))

(def training-tag-count-start-end
  (map (comp frequencies flatten) training-tag-list-start-end))
(defn nested-previous-tag-tag-count
  "Nested counts in the format of {prior-tag {tag count}}"
  [previous-tag-tag-list]
  (reduce #(nested-insert %1 %2) {} (apply concat previous-tag-tag-list)))

(def nested-previous-tag-tag-count-training-list
  (map nested-previous-tag-tag-count training-previous-tag-tag-list-start-end))
(defn unique-keys-count [m]
  (count (keys m)))
(def unique-words-count-training-list (map count word-count-training-list))
;; Calculating probabilities
(defn make-prob-fn-map
  [[nested-t-w-count word-count unique-wc nested-prior-t-t-count tag-count-st-end unique-tc]]
  {:prob-word-given-tag ;; Construct Emission Probabilities
   (fn [word tag]
     (let [word-given-tag (nil?-zero get-in nested-t-w-count [tag word])
           tc (nil?-zero word-count word)]
       (/ (+ word-given-tag *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))
   :prob-tag-given-previous-tag ;; Construct Transition Probabilities
   (fn [tag previous-tag]
     (let [tag-given-prior-tag-prob (nil?-zero get-in nested-prior-t-t-count [previous-tag tag])
           tc (nil?-zero tag-count-st-end previous-tag)]
       (/ (+ tag-given-prior-tag-prob *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))})
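;; A sketch (not part of the original gist) of building the smoothed
;; probability functions for the first cross-validation fold and querying
;; them; the argument order mirrors the call made in run-viterbi below.
(comment
  (let [tags (into #{} (map second (apply concat (second (first testing-and-training-sentences)))))
        {:keys [prob-word-given-tag prob-tag-given-previous-tag]}
        (make-prob-fn-map [(first nested-tag-word-count-training-list)
                           (first word-count-training-list)
                           (first unique-words-count-training-list)
                           (first nested-previous-tag-tag-count-training-list)
                           (first training-tag-count-start-end)
                           (count tags)])]
    [(prob-word-given-tag "the" "DT")            ;; add-epsilon smoothed estimate
     (prob-tag-given-previous-tag "NN" "DT")]))  ;; add-epsilon smoothed estimate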
(defn viterbi-init
  "Initializes viterbi for us"
  [v path obs states start-p emit-p]
  (reduce
   #(into %1 {%2 [(* (start-p %2)
                     (emit-p (first obs) %2))
                  (conj path %2)]})
   {}
   states))

(defn extract-prob-state
  "Extracts the current probability and state for a given [v path]"
  [v path]
  [(first (v path)) path])
(defn viterbi-step
  "Goes through one step of viterbi for us, taking a prior state and performing one step"
  [prior obs states trans-p emit-p]
  (apply merge
         (map (comp (fn [[path v]] {(last path) [v path]})
                    #(apply max-key val %)
                    #(apply merge %))
              ((fn [obs]
                 (map #(applyAll (map (comp (fn [[v past-st]]
                                              (fn [current-st]
                                                {(conj (second (prior past-st)) current-st)
                                                 (* v (trans-p current-st past-st)
                                                    (emit-p obs current-st))}))
                                            (partial extract-prob-state prior))
                                      states)
                                 %)
                      states))
               obs))))

(defn viterbi [observations states start-prob trans-prob emit-prob]
  (let [init (viterbi-init [] [] observations states start-prob emit-prob)]
    (reduce #(viterbi-step %1 %2 states trans-prob emit-prob) init (rest observations))))

(defn viterbi-solution [observations states start-prob trans-prob emit-prob]
  (apply max-key #(first (val %)) (viterbi observations states start-prob trans-prob emit-prob)))
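;; A minimal sketch (not from the original gist) of running the Viterbi
;; functions above on a hypothetical two-tag, two-word HMM. The probabilities
;; are made up purely for illustration; the call shapes mirror run-viterbi
;; below: (start-p tag), (trans-p tag previous-tag) and (emit-p word tag).
(comment
  (let [states  #{"DT" "NN"}
        obs     ["the" "dog"]
        start-p {"DT" 0.8, "NN" 0.2}
        trans-p (fn [tag prev] (get-in {"DT" {"DT" 0.1, "NN" 0.9}
                                        "NN" {"DT" 0.4, "NN" 0.6}}
                                       [prev tag]))
        emit-p  (fn [word tag] (get-in {"DT" {"the" 0.9, "dog" 0.1}
                                        "NN" {"the" 0.1, "dog" 0.9}}
                                       [tag word]))]
    (viterbi-solution obs states start-p trans-p emit-p))
  ;; => ["NN" [~0.5832 ["DT" "NN"]]] -- the most probable final tag, its
  ;; probability, and the best tag path (recovered by extract-path below).
  )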
(defn extract-path
  "Extracts the path from a viterbi solution"
  [solution]
  (second (second solution)))

(defn extract-tag-count [sample]
  (reduce insert {} (flatten (map second (second sample)))))

(defn extract-states [sample]
  (into #{} (flatten (map #(map second %) (second sample)))))

(defn extract-observations [sample]
  (map #(map first %) (first sample)))

(defn extract-testing-tags [sample]
  (map #(map second %) (first sample)))

(defn compare-matches
  "Compares pairs of tag sequences and gives the proportion of matching tags for each pair"
  [results]
  (map (comp (fn [m] (/ (nil?-zero m true) (+ (nil?-zero m true) (nil?-zero m false))))
             frequencies
             (fn [[a b]] (map #(= %1 %2) a b)))
       results))
(defn average-accuracy [accuracy-scores]
  (/ (apply + accuracy-scores) (double (count accuracy-scores))))
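;; A quick sketch (illustrative values only) of the scoring helpers: each
;; element pairs a gold tag sequence with a predicted one, and the result is
;; the per-sentence fraction of matching tags.
(comment
  (compare-matches [[["DT" "NN" "VBZ"] ["DT" "NN" "NN"]]
                    [["DT" "NN"] ["DT" "NN"]]])
  ;; => (2/3 1)
  (average-accuracy [2/3 1])
  ;; => ~0.83
  )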
(defn run-viterbi
  "Runs viterbi with transition and emission probabilities calculated from the training data via cross validation"
  []
  (let [states (map extract-states testing-and-training-sentences)
        observations (map extract-observations testing-and-training-sentences)
        prob-map (map make-prob-fn-map
                      (map vector
                           nested-tag-word-count-training-list
                           word-count-training-list
                           unique-words-count-training-list
                           nested-previous-tag-tag-count-training-list
                           training-tag-count-start-end
                           (map count states)))
        transition-prob (map :prob-tag-given-previous-tag prob-map)
        emission-prob (map :prob-word-given-tag prob-map)
        start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
        testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
    (map #(map vector %1 %2)
         testing-tags-list
         (map (fn [[obs-list sts start-p trans-p emit-p]]
                (map (fn [obs]
                       (extract-path (viterbi-solution obs sts start-p trans-p emit-p)))
                     obs-list))
              (map vector observations states start-prob transition-prob emission-prob)))))

(defn -main []
  (average-accuracy (map (comp average-accuracy compare-matches) (run-viterbi))))
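;; -main runs the full 10-fold cross-validation and returns the tagging
;; accuracy averaged over sentences and folds.
(comment
  (-main))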
;; Checking functions
(defn close-to-1 [val]
  (> 0.000001 (math/abs (- 1 val))))

;; Assert that probabilities are sensible
(defn check-probs? []
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map
                       (map vector nested-tag-word-count-training-list word-count-training-list
                            unique-words-count-training-list nested-previous-tag-tag-count-training-list
                            training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true? (map close-to-1
                        (map (fn [[start-pr st]] (apply + (map start-pr st)))
                             (map vector start-prob states)))))
   "Start probabilities are not sensible")
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map
                       (map vector nested-tag-word-count-training-list word-count-training-list
                            unique-words-count-training-list nested-previous-tag-tag-count-training-list
                            training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map #(partial % "START") transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true? (map (fn [[emit-pr st wctl]]
                          (every? true? (map close-to-1
                                             (map (fn [word] (apply + (map #(emit-pr word %) st)))
                                                  (keys wctl)))))
                        (map vector emission-prob states word-count-training-list))))
   "Emission probabilities are not sensible")
  (assert
   (every? true?
           (let [states (map extract-states testing-and-training-sentences)
                 observations (map extract-observations testing-and-training-sentences)
                 prob-map (map make-prob-fn-map
                               (map vector nested-tag-word-count-training-list word-count-training-list
                                    unique-words-count-training-list nested-previous-tag-tag-count-training-list
                                    training-tag-count-start-end (map count states)))
                 transition-prob (map :prob-tag-given-previous-tag prob-map)
                 emission-prob (map :prob-word-given-tag prob-map)
                 start-prob (map #(partial % "START") transition-prob)
                 testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
             (map (fn [[trans-pr st]]
                    (every? true? (map close-to-1
                                       (map (fn [prior] (apply + (map #(trans-pr % prior) st)))
                                            (disj st ".")))))
                  (map vector transition-prob states))))
   "Transition probabilities are not sensible"))