Skip to content

Instantly share code, notes, and snippets.

@pierric
Last active February 14, 2016 07:19
Show Gist options
  • Save pierric/ef52d3bfbadd03c538fa to your computer and use it in GitHub Desktop.
Save pierric/ef52d3bfbadd03c538fa to your computer and use it in GitHub Desktop.
The viterbi algorithm for HMM in Haskell language
data Tag = TagStart | TagStop | Tag Int deriving (Eq, Show)
type Word = String
class CorpusModel m where
tags :: m -> [Tag]
tag_name :: m -> Tag -> String
tag_count_u :: m -> Tag -> Int
tag_count_b :: m -> (Tag, Tag) -> Int
tag_count_t :: m -> (Tag, Tag, Tag) -> Int
word_tag_count :: m -> Word -> Tag -> Int
viterbi :: CorpusModel m => m -> [Word] -> [Tag]
viterbi m s = let -- list all possible ends tag pairs, and find the maximum one.
candidates = [(taccum (len+1) t1 t2 TagStop, (t1, t2)) | t1 <- validtags, t2 <- validtags]
(_, (t1, t2)) = maximumBy value_compare candidates
-- utility step function that extract the previous tag
-- from tag pair (t1,t2) at step n.
step (n,t1,t2) = fromJust $ snd $ pi_cached !# n !# (t1,t2)
-- repeat the above step utils the full path of tags are extracted,
-- note that this path is reversed.
path = t2 : t1 : map step (zip3 (reverse [3..len]) (tail path) path)
in reverse path
where
validtags = tags m
len = length s
min_tag = TagStart
max_tag = maximum validtags
tag_range = ((min_tag,min_tag),(max_tag,max_tag))
sentence = listArray (1,len) s
-- word probability
wprob n = let cnt = word_tag_count m (sentence !# n)
in \t -> (fromIntegral (cnt t) / fromIntegral (tag_count_u m t)) :: Double
-- one-step transition probability
tprob t1 t2 t3 = (fromIntegral (tag_count_t m (t1,t2,t3)) / fromIntegral (tag_count_b m (t1,t2))):: Double
-- accumulated transition probability
taccum 1 t1 t2 t3 = tprob t1 t2 t3
taccum n t1 t2 t3 = get_value (pi_cached !# (n-1) !# (t1, t2)) * tprob t1 t2 t3
pi :: Int -> (Tag,Tag) -> (Double, Maybe Tag)
pi n (t2,t3) | t3 `elem` [TagStart, TagStop] = (0, Nothing)
| n == 1 && t2 /= TagStart = (0, Nothing)
| n > 1 && t2 `elem` [TagStart, TagStop] = (0, Nothing)
-- (n = 1 ==> t2 = TagStart) && (n > 1 ==> t2 /= TagStart,TagStop)
| otherwise = let wprobn = wprob n
in if n <= 2 then
(taccum n TagStart t2 t3 * wprobn t3, Just TagStart)
else
let candidates = [(taccum n t1 t2 t3 * wprobn t3, Just t1) | t1 <- validtags]
in maximumBy value_compare candidates
-- cached pi values
-- by the lazy semantics, each value is computed only when it is necessary and always computed only once.
pi_cached :: Array Int (Array (Tag,Tag) (Double, Maybe Tag))
pi_cached = listArray (1,len)
( flip map [1..len] (\n ->
listArray tag_range $ map (pi n) (range tag_range)))
get_value (v, _) = v
value_compare a b = compare (get_value a) (get_value b)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment