pierric/viterbi.hs

## viterbi.hs
data Tag  = TagStart | TagStop | Tag Int deriving (Eq, Show)
type Word = String

class CorpusModel m where
    tags           :: m -> [Tag]
    tag_name       :: m -> Tag -> String
    tag_count_u    :: m -> Tag -> Int
    tag_count_b    :: m -> (Tag, Tag) -> Int
    tag_count_t    :: m -> (Tag, Tag, Tag) -> Int
    word_tag_count :: m -> Word -> Tag -> Int

viterbi :: CorpusModel m => m -> [Word] -> [Tag]
viterbi m s = let -- list all possible ends tag pairs, and find the maximum one.
                  candidates     = [(taccum (len+1) t1 t2 TagStop, (t1, t2)) | t1 <- validtags, t2 <- validtags]
                  (_, (t1, t2))  = maximumBy value_compare candidates
                  -- utility step function that extract the previous tag
                  -- from tag pair (t1,t2) at step n.
                  step (n,t1,t2) = fromJust $ snd $ pi_cached !# n !# (t1,t2)
                  -- repeat the above step utils the full path of tags are extracted,
                  -- note that this path is reversed.
                  path = t2 : t1 : map step (zip3 (reverse [3..len]) (tail path) path)
              in reverse path
  where
    validtags = tags m
    len       = length s
    min_tag   = TagStart
    max_tag   = maximum validtags
    tag_range = ((min_tag,min_tag),(max_tag,max_tag))
    sentence  = listArray (1,len) s

    -- word probability
    wprob n = let cnt = word_tag_count m (sentence !# n)
              in \t -> (fromIntegral (cnt t) / fromIntegral (tag_count_u m t))  :: Double

    -- one-step transition probability
    tprob t1 t2 t3 = (fromIntegral (tag_count_t m (t1,t2,t3)) / fromIntegral (tag_count_b m (t1,t2))):: Double
    -- accumulated transition probability
    taccum 1 t1 t2 t3 = tprob t1 t2 t3
    taccum n t1 t2 t3 = get_value (pi_cached !# (n-1) !# (t1, t2)) * tprob t1 t2 t3

    pi :: Int -> (Tag,Tag) -> (Double, Maybe Tag)
    pi n (t2,t3) | t3 `elem` [TagStart, TagStop]           = (0, Nothing)
                 | n == 1 && t2 /= TagStart                = (0, Nothing)
                 | n >  1 && t2 `elem` [TagStart, TagStop] = (0, Nothing)
                 -- (n = 1 ==> t2 = TagStart) && (n > 1 ==> t2 /= TagStart,TagStop)
                 | otherwise = let wprobn = wprob n
                               in if n <= 2 then
                                    (taccum n TagStart t2 t3 * wprobn t3, Just TagStart)
                                  else
                                    let candidates = [(taccum n t1 t2 t3 * wprobn t3, Just t1) | t1 <- validtags]
                                    in maximumBy value_compare candidates
    -- cached pi values
    -- by the lazy semantics, each value is computed only when it is necessary and always computed only once.
    pi_cached :: Array Int (Array (Tag,Tag) (Double, Maybe Tag))
    pi_cached  = listArray (1,len)
                   ( flip map [1..len] (\n ->
                        listArray tag_range $ map (pi n) (range tag_range)))

    get_value (v, _)      = v
    value_compare a b     = compare (get_value a) (get_value b)
	data Tag = TagStart \| TagStop \| Tag Int deriving (Eq, Show)
	type Word = String

	class CorpusModel m where
	tags :: m -> [Tag]
	tag_name :: m -> Tag -> String
	tag_count_u :: m -> Tag -> Int
	tag_count_b :: m -> (Tag, Tag) -> Int
	tag_count_t :: m -> (Tag, Tag, Tag) -> Int
	word_tag_count :: m -> Word -> Tag -> Int

	viterbi :: CorpusModel m => m -> [Word] -> [Tag]
	viterbi m s = let -- list all possible ends tag pairs, and find the maximum one.
	candidates = [(taccum (len+1) t1 t2 TagStop, (t1, t2)) \| t1 <- validtags, t2 <- validtags]
	(_, (t1, t2)) = maximumBy value_compare candidates
	-- utility step function that extract the previous tag
	-- from tag pair (t1,t2) at step n.
	step (n,t1,t2) = fromJust $ snd $ pi_cached !# n !# (t1,t2)
	-- repeat the above step utils the full path of tags are extracted,
	-- note that this path is reversed.
	path = t2 : t1 : map step (zip3 (reverse [3..len]) (tail path) path)
	in reverse path
	where
	validtags = tags m
	len = length s
	min_tag = TagStart
	max_tag = maximum validtags
	tag_range = ((min_tag,min_tag),(max_tag,max_tag))
	sentence = listArray (1,len) s

	-- word probability
	wprob n = let cnt = word_tag_count m (sentence !# n)
	in \t -> (fromIntegral (cnt t) / fromIntegral (tag_count_u m t)) :: Double

	-- one-step transition probability
	tprob t1 t2 t3 = (fromIntegral (tag_count_t m (t1,t2,t3)) / fromIntegral (tag_count_b m (t1,t2))):: Double
	-- accumulated transition probability
	taccum 1 t1 t2 t3 = tprob t1 t2 t3
	taccum n t1 t2 t3 = get_value (pi_cached !# (n-1) !# (t1, t2)) * tprob t1 t2 t3

	pi :: Int -> (Tag,Tag) -> (Double, Maybe Tag)
	pi n (t2,t3) \| t3 `elem` [TagStart, TagStop] = (0, Nothing)
	\| n == 1 && t2 /= TagStart = (0, Nothing)
	\| n > 1 && t2 `elem` [TagStart, TagStop] = (0, Nothing)
	-- (n = 1 ==> t2 = TagStart) && (n > 1 ==> t2 /= TagStart,TagStop)
	\| otherwise = let wprobn = wprob n
	in if n <= 2 then
	(taccum n TagStart t2 t3 * wprobn t3, Just TagStart)
	else
	let candidates = [(taccum n t1 t2 t3 * wprobn t3, Just t1) \| t1 <- validtags]
	in maximumBy value_compare candidates
	-- cached pi values
	-- by the lazy semantics, each value is computed only when it is necessary and always computed only once.
	pi_cached :: Array Int (Array (Tag,Tag) (Double, Maybe Tag))
	pi_cached = listArray (1,len)
	( flip map [1..len] (\n ->
	listArray tag_range $ map (pi n) (range tag_range)))

	get_value (v, _) = v
	value_compare a b = compare (get_value a) (get_value b)