Skip to content

Instantly share code, notes, and snippets.

@veer66
Created March 5, 2015 03:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veer66/8055128d63bbc609129d to your computer and use it in GitHub Desktop.
Save veer66/8055128d63bbc609129d to your computer and use it in GitHub Desktop.
A simple dictionary-based Thai word segmentation program written in Haskell
{-
Copyright (c) 2015, Vee Satayamas
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-}
-- A simple dictionary-based Thai word segmentation program written in Haskell
-- Unknown words, non-Thai words are still not handled properly
-- tdict-std.txt can be obtained from http://github.com/veer66/wordcut
import Text.Show
import Data.List
import Data.Array
import System.IO
import Data.Maybe
import Debug.Trace
data DictAcceptor l r final offset = DictAcceptor Int Int Bool Int deriving (Show)
data PathInfo i w unk p = PathInfo Int Int Int Int deriving (Show)
load_dict = do
filedata <-readFile "tdict-std.txt"
return $ lines filedata
data SeekPolicy = LEFT | RIGHT
dict_seek :: SeekPolicy -> [String] -> Int -> Int -> Int -> Char -> Maybe Int
dict_seek policy wlst l r offset ch = seek l r Nothing where
seek :: Int -> Int -> Maybe Int -> Maybe Int
seek l r a = if l > r then
a
else
let m = (l+r) `div` 2
w = wlst !! m
wlen = length w
in if wlen <= offset then
seek (m+1) r a
else
let ch_ = w !! offset
in if ch_ < ch then
seek (m+1) r a
else
if ch_ > ch then
seek l (m-1) a
else
let a_ = Just m
in case policy of
LEFT -> seek l (m-1) (Just m)
RIGHT -> seek (m+1) r (Just m)
transit_each :: [String] -> Char -> DictAcceptor l r final offset -> Maybe (DictAcceptor l r final offset)
transit_each wlst ch a =
_transit a where
_transit (DictAcceptor l r final offset) =
let m_l = dict_seek LEFT wlst l r offset ch
in if isNothing m_l then
Nothing
else
let _l = fromJust m_l
in let m_r = dict_seek RIGHT wlst _l r offset ch
in if isNothing m_r then
Nothing
else
let _r = fromJust m_r
_offset = offset +1
w = wlst !! _l
is_final = _offset == wlen
wlen = length w
in Just $ DictAcceptor _l _r is_final _offset
transit :: [String] -> Int -> [DictAcceptor l r final offset] -> Char -> [DictAcceptor l r final offset]
transit wlst wllen acc ch = map fromJust sel_acc where
sel_acc = filter isJust sel_macc
sel_macc = map _transit_each new_acc
_transit_each = transit_each wlst ch
new_acc = new_a:acc
new_a = DictAcceptor 0 (wllen-1) False 0
getFinals acc = filter is_final acc where
is_final (DictAcceptor l r final offset) = final
buildCandidatesByAcc i finals paths = map f finals where
f (DictAcceptor l r final offset) =
let p = i - offset + 1
(PathInfo j pw punk _) = paths !! p
w_ = pw + 1
in PathInfo i w_ punk p
buildCandidates i finals paths =
let final_count = length finals
in if final_count == 0 then
[PathInfo i 10 10 0]
else
buildCandidatesByAcc i finals paths
instance Eq (PathInfo i w unk p) where
(PathInfo _ w0 unk0 _) == (PathInfo _ w1 unk1 _) = unk0 == unk1
instance Ord (PathInfo i w unk p) where
compare (PathInfo _ w0 unk0 _) (PathInfo _ w1 unk1 _ ) =
let unk_cmp = compare unk0 unk1 in
if unk_cmp == EQ then
unk_cmp
else
compare w0 w1
data TextRange s e = TextRange Int Int deriving (Show)
path_to_ranges pathinfos =
let e = (length pathinfos) - 1
in iter [] e where
iter ranges 0 = ranges
iter ranges e = (iter _ranges p) where
_ranges = (TextRange p e) : ranges
PathInfo i w unk p = pathinfos !! e
find_path :: [String] -> Int -> String -> [PathInfo i w unk p]
find_path wlst wllen txt = iter txtlen [PathInfo (-1) 0 0 (-1)] [] where
txtlen = length txt
iter 0 paths acc = paths
iter n paths acc = iter (n-1) _paths _acc where
i = txtlen - n
_acc = _transit acc ch
_transit = transit wlst wllen
ch = txt !! i
_paths = paths ++ [selected_path]
selected_path = minimum candidates
candidates = buildCandidates i finals paths
finals = getFinals _acc
range_to_text :: String -> TextRange s e -> String
range_to_text txt r = drop s rear where
rear = take e txt
TextRange s e = r
segment :: [String] -> Int -> String -> [String]
segment wlst wllen txt = map _range_to_text ranges where
_range_to_text = range_to_text txt
ranges = path_to_ranges pathinfos
pathinfos = find_path wlst wllen txt
main = do
wlst <- load_dict
let wllen = length wlst
txt <- getLine
let words = segment wlst wllen txt
mapM_ putStrLn words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment