Created
March 5, 2015 03:09
-
-
Save veer66/8055128d63bbc609129d to your computer and use it in GitHub Desktop.
A simple dictionary-based Thai word segmentation program written in Haskell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{- | |
Copyright (c) 2015, Vee Satayamas | |
All rights reserved. | |
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
-} | |
-- A simple dictionary-based Thai word segmentation program written in Haskell | |
-- Unknown words, non-Thai words are still not handled properly | |
-- tdict-std.txt can be obtained from http://github.com/veer66/wordcut | |
import Text.Show | |
import Data.List | |
import Data.Array | |
import System.IO | |
import Data.Maybe | |
import Debug.Trace | |
data DictAcceptor l r final offset = DictAcceptor Int Int Bool Int deriving (Show) | |
data PathInfo i w unk p = PathInfo Int Int Int Int deriving (Show) | |
load_dict = do | |
filedata <-readFile "tdict-std.txt" | |
return $ lines filedata | |
data SeekPolicy = LEFT | RIGHT | |
dict_seek :: SeekPolicy -> [String] -> Int -> Int -> Int -> Char -> Maybe Int | |
dict_seek policy wlst l r offset ch = seek l r Nothing where | |
seek :: Int -> Int -> Maybe Int -> Maybe Int | |
seek l r a = if l > r then | |
a | |
else | |
let m = (l+r) `div` 2 | |
w = wlst !! m | |
wlen = length w | |
in if wlen <= offset then | |
seek (m+1) r a | |
else | |
let ch_ = w !! offset | |
in if ch_ < ch then | |
seek (m+1) r a | |
else | |
if ch_ > ch then | |
seek l (m-1) a | |
else | |
let a_ = Just m | |
in case policy of | |
LEFT -> seek l (m-1) (Just m) | |
RIGHT -> seek (m+1) r (Just m) | |
transit_each :: [String] -> Char -> DictAcceptor l r final offset -> Maybe (DictAcceptor l r final offset) | |
transit_each wlst ch a = | |
_transit a where | |
_transit (DictAcceptor l r final offset) = | |
let m_l = dict_seek LEFT wlst l r offset ch | |
in if isNothing m_l then | |
Nothing | |
else | |
let _l = fromJust m_l | |
in let m_r = dict_seek RIGHT wlst _l r offset ch | |
in if isNothing m_r then | |
Nothing | |
else | |
let _r = fromJust m_r | |
_offset = offset +1 | |
w = wlst !! _l | |
is_final = _offset == wlen | |
wlen = length w | |
in Just $ DictAcceptor _l _r is_final _offset | |
transit :: [String] -> Int -> [DictAcceptor l r final offset] -> Char -> [DictAcceptor l r final offset] | |
transit wlst wllen acc ch = map fromJust sel_acc where | |
sel_acc = filter isJust sel_macc | |
sel_macc = map _transit_each new_acc | |
_transit_each = transit_each wlst ch | |
new_acc = new_a:acc | |
new_a = DictAcceptor 0 (wllen-1) False 0 | |
getFinals acc = filter is_final acc where | |
is_final (DictAcceptor l r final offset) = final | |
buildCandidatesByAcc i finals paths = map f finals where | |
f (DictAcceptor l r final offset) = | |
let p = i - offset + 1 | |
(PathInfo j pw punk _) = paths !! p | |
w_ = pw + 1 | |
in PathInfo i w_ punk p | |
buildCandidates i finals paths = | |
let final_count = length finals | |
in if final_count == 0 then | |
[PathInfo i 10 10 0] | |
else | |
buildCandidatesByAcc i finals paths | |
instance Eq (PathInfo i w unk p) where | |
(PathInfo _ w0 unk0 _) == (PathInfo _ w1 unk1 _) = unk0 == unk1 | |
instance Ord (PathInfo i w unk p) where | |
compare (PathInfo _ w0 unk0 _) (PathInfo _ w1 unk1 _ ) = | |
let unk_cmp = compare unk0 unk1 in | |
if unk_cmp == EQ then | |
unk_cmp | |
else | |
compare w0 w1 | |
data TextRange s e = TextRange Int Int deriving (Show) | |
path_to_ranges pathinfos = | |
let e = (length pathinfos) - 1 | |
in iter [] e where | |
iter ranges 0 = ranges | |
iter ranges e = (iter _ranges p) where | |
_ranges = (TextRange p e) : ranges | |
PathInfo i w unk p = pathinfos !! e | |
find_path :: [String] -> Int -> String -> [PathInfo i w unk p] | |
find_path wlst wllen txt = iter txtlen [PathInfo (-1) 0 0 (-1)] [] where | |
txtlen = length txt | |
iter 0 paths acc = paths | |
iter n paths acc = iter (n-1) _paths _acc where | |
i = txtlen - n | |
_acc = _transit acc ch | |
_transit = transit wlst wllen | |
ch = txt !! i | |
_paths = paths ++ [selected_path] | |
selected_path = minimum candidates | |
candidates = buildCandidates i finals paths | |
finals = getFinals _acc | |
range_to_text :: String -> TextRange s e -> String | |
range_to_text txt r = drop s rear where | |
rear = take e txt | |
TextRange s e = r | |
segment :: [String] -> Int -> String -> [String] | |
segment wlst wllen txt = map _range_to_text ranges where | |
_range_to_text = range_to_text txt | |
ranges = path_to_ranges pathinfos | |
pathinfos = find_path wlst wllen txt | |
main = do | |
wlst <- load_dict | |
let wllen = length wlst | |
txt <- getLine | |
let words = segment wlst wllen txt | |
mapM_ putStrLn words |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment