Parsec: RFC 5322 (RFC 822) email address parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Debug.Trace | |
import Control.Monad.State | |
import Text.Parsec | |
import Data.Char | |
import Control.Monad.Trans | |
import Control.Applicative hiding((<|>),many,optional) | |
-- trace s f = f | |
-- ref. RFC5322 and RFC5234 | |
--debug_putStrLn = putStrLn | |
debug_putStrLn = return | |
debug_print s0 = liftIO $ debug_putStrLn $ s0 | |
debug_in s0 = debug_print $ "[IN] " ++ s0 | |
debug_return s0 s1 = do | |
liftIO $ debug_putStrLn $ "[OUT] " ++ s0 ++ " : [" ++ s1 ++ "]" | |
return s1 | |
-- address = mailbox / group | |
address :: ParsecT [Char] u IO [Char] | |
address = do | |
debug_in "address" | |
s0 <- mailbox <|> group | |
-- debug_return "address" s0 | |
return s0 | |
-- mailbox = name-addr / addr-spec | |
mailbox = do | |
debug_in "mailbox" | |
s0 <- try(name_addr) <|> addr_spec | |
debug_return "mailbox" s0 | |
-- name-addr = [display-name] angle-addr | |
name_addr = do | |
debug_in "name_addr" | |
s0 <- option "" display_name | |
s1 <- angle_addr | |
liftIO $ putStrLn $ " DISPLAY_NAME: " ++ s0 | |
liftIO $ putStrLn $ " ADDDR_SPEC: " ++ s1 | |
debug_return "name_addr" $ s0 ++ s1 | |
-- angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / | |
-- obs-angle-addr | |
angle_addr = do | |
debug_in "angle_addr" | |
s0 <- option "" cfws | |
s1 <- char '<' | |
s2 <- addr_spec | |
s3 <- char '>' | |
s4 <- option "" cfws | |
liftIO $ putStrLn $ " ADDDR_SPEC: " ++ s2 | |
debug_return "angle_addr" $ s0 ++ [s1] ++ s2 ++ [s3] ++ s4 | |
<|> | |
obs_angle_addr | |
-- group = display-name ":" [group-list] ";" [CFWS] | |
group = do | |
debug_in "group" | |
s0 <- display_name | |
s1 <- char ':' | |
s2 <- option "" group_list | |
s3 <- char ';' | |
s4 <- option "" cfws | |
debug_return "group" $ s0 ++ [s1] ++ s2 ++ [s3] ++ s4 | |
-- display-name = phrase | |
display_name = do | |
debug_in "display_name" | |
s0 <- phrase | |
debug_return "display_name" s0 | |
-- mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list | |
mailbox_list :: ParsecT [Char] u IO [Char] | |
mailbox_list = do { | |
s0 <- mailbox ; | |
s1 <- many (do | |
s0 <- char ',' | |
s1 <- mailbox | |
return $ [s0] ++ s1 | |
); | |
debug_return "mailbox_list" $ s0 ++ (concat s1) | |
} <|> | |
obs_mbox_list | |
-- address-list = (address *("," address)) / obs-addr-list | |
{- | |
address_list :: ParsecT [Char] u IO [Char] | |
address_list = try(do { | |
address ; | |
many (char ',' >> address) | |
} ) <|> | |
obs_addr_list | |
-} | |
-- group-list = mailbox-list / CFWS / obs-group-list | |
group_list = do | |
mailbox_list | |
<|> | |
cfws | |
<|> | |
obs_group_list | |
-- addr-spec = local-part "@" domain | |
addr_spec = do | |
debug_in "addr_spec" | |
s0 <- local_part | |
s1 <- char '@' | |
s2 <- domain | |
liftIO $ putStrLn $ " LOCAL-PART: " ++ s0 | |
liftIO $ putStrLn $ " DOMAIN: " ++ s2 | |
debug_return "addr_spec" $ s0 ++ [s1] ++ s2 | |
-- local-part = dot-atom / quoted-string / obs-local-part | |
local_part = do | |
debug_in "local_part" | |
s0 <- (try dot_atom) <|> try quoted_string <|> obs_local_part | |
debug_return "local_part" s0 | |
-- domain = dot-atom / domain-literal / obs-domain | |
domain = do | |
debug_in "domain" | |
s0 <- try dot_atom <|> try domain_literal <|> obs_domain | |
debug_return "domain" s0 | |
domain_literal = do | |
s0 <- option "" cfws | |
s1 <- char '[' | |
s2 <- many (do | |
s0 <- option "" fws | |
s1 <- dtext | |
return $ s0 ++ s1 | |
) | |
s3 <- option "" fws | |
s4 <- char ']' | |
s5 <- option "" cfws | |
debug_return "domain_literal" $ s0 ++ [s1] ++ (concat s2) ++ s3 ++ [s4] ++ s5 | |
dtext = do | |
debug_in "dtext" | |
s0 <- do { | |
s0 <- oneOf $ map chr ([33..90] ++ [94..126]); | |
return [s0] | |
}<|> obs_dtext | |
debug_return "dtext" s0 | |
-- CFWS = (1*([FWS] comment) [FWS]) / FWS | |
cfws :: ParsecT [Char] u IO [Char] | |
cfws = do | |
debug_in "cfws" | |
s0 <- try(do { | |
s0 <- many1 (do | |
s0 <- option "" fws | |
s1 <- comment | |
return $ s0 ++ s1 | |
); | |
s1 <- option "" fws; | |
return $ (concat s0) ++ s1 | |
} ) <|> fws | |
-- debug_return "cfws" s0 | |
debug_return "cfws" "" | |
-- FWS = ([*WSP CRLF] 1*WSP) / obs-FWS | |
fws :: ParsecT [Char] u IO [Char] | |
fws = do | |
debug_in "fws" | |
s0 <- try(do { | |
s0 <- try (option "" (do | |
s0 <- many wsp | |
s1 <- crlf | |
return $ s0 ++ s1 | |
)); | |
s1 <- many1 wsp; | |
return $ s0 ++ s1 | |
} ) <|> obs_fws | |
debug_return "fws" s0 | |
wsp = do | |
s1 <- char ' ' <|> char '\t' <?> "@wsp" | |
-- debug_print "wsp" $ "'" ++ [s1] ++ "'" | |
return s1 | |
-- obs-FWS = 1*WSP *(CRLF 1*WSP) | |
obs_fws = do | |
s0 <- many1 wsp | |
s1 <- many (do | |
s0 <- crlf | |
s1 <- many1 wsp | |
return $ s0 ++ s1 | |
) | |
debug_return "OBS_fws" $ s0 ++ (concat s1) | |
crlf = do | |
s1 <- ( do | |
s0 <- char '\r' | |
s1 <- char '\n' | |
return $ [s0, s1] | |
) <?> "@crlf" | |
debug_return "crlf" s1 | |
ctext = | |
try (oneOf $ map chr ([33..39] ++ [42..91] ++ [93..126])) | |
<|> | |
obs_ctext | |
-- ccontent = ctext / quoted-pair / comment | |
ccontent :: ParsecT [Char] u IO [Char] | |
ccontent = do | |
debug_in "ccontent" | |
s0 <- ((try ctext) >>= return . pure) | |
<|> | |
(try quoted_pair) | |
<|> | |
comment | |
debug_return "ccontent" s0 | |
-- comment = "(" *([FWS] ccontent) [FWS] ")" | |
comment = do | |
debug_in "comment" | |
s0 <- char '(' | |
s1 <- many (do | |
s0 <- option "" fws | |
s1 <- ccontent | |
return $ s0 ++ s1 | |
) | |
s2 <- option "" fws | |
s3 <- char ')' | |
debug_return "comment" $ [s0] ++ (concat s1) ++ s2 ++ [s3] | |
-- obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] | |
obs_angle_addr = do | |
s0 <- option "" cfws | |
s1 <- char '<' | |
s2 <- obs_route | |
s3 <- addr_spec | |
s4 <- char '>' | |
s5 <- option "" cfws | |
debug_return "OBS_angle_addr" $ s0 ++ [s1] ++ s2 ++ s3 ++ [s4] ++ s5 | |
-- obs-route = obs-domain-list ":" | |
obs_route = do | |
s0 <- obs_domain_list | |
s1 <- char ':' | |
debug_return "OBS_route" $ s0 ++ [s1] | |
-- obs-domain-list = *(CFWS / ",") "@" domain | |
-- *("," [CFWS] ["@" domain]) | |
obs_domain_list = do | |
s0 <- many (cfws <|> (char ',' >> return ",")) | |
s1 <- char '@' | |
s2 <- domain | |
s3 <- many (do | |
s0 <- char ',' | |
s1 <- option "" cfws | |
s2 <- option "" (do | |
s0 <- char '@' | |
s1 <- domain | |
return $ [s0] ++ s1) | |
return $ [s0] ++ s1 ++ s2) | |
debug_return "OBS_domain_list" $ (concat s0) ++ [s1] ++ s2 ++ (concat s3) | |
-- obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) | |
obs_mbox_list = do | |
s0 <- many (do | |
s0 <- cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- mailbox | |
s2 <- many (do | |
s0 <- char ',' | |
s1 <- option "" (mailbox <|> cfws) | |
return $ [s0] ++ s1 | |
) | |
debug_return "OBS_mbox_list" $ (concat s0) ++ s1 ++ (concat s2) | |
-- obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) | |
{- | |
obs_addr_list :: ParsecT [Char] u IO [Char] | |
obs_addr_list = do | |
s0 <- many (do | |
s0 <- option "" cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- address | |
s2 <- many (do | |
s0 <- char ',' | |
s1 <- option "" (address <|> cfws) | |
return $ [s0] ++ s1) | |
return $ (concat s0) ++ s1 ++ (concat s2) | |
-} | |
-- obs-group-list = 1*([CFWS] ",") [CFWS] | |
obs_group_list = do | |
s0 <- many1 (do | |
s0 <- option "" cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- option "" cfws | |
return $ (concat s0) ++ s1 | |
-- obs-local-part = word *("." word) | |
obs_local_part = do | |
s0 <- word | |
s1 <- many (do | |
s0 <- char '.' | |
s1 <- word | |
return $ [s0] ++ s1 | |
) | |
return $ s0 ++ (concat s1) | |
-- obs-domain = atom *("." atom) | |
obs_domain = do | |
s0 <- atom | |
s1 <- many (char '.' >> atom) | |
return $ s0 ++ (concat s1) | |
-- obs-dtext = obs-NO-WS-CTL / quoted-pair | |
obs_dtext :: ParsecT [Char] u IO [Char] | |
obs_dtext = do | |
debug_in "OBS_dtext" | |
(obs_NO_WS_CTL >>= return . pure) | |
<|> | |
quoted_pair | |
word :: ParsecT [Char] u IO [Char] | |
word = do | |
debug_in "word" | |
s0 <- try atom <|> quoted_string | |
debug_return "word" s0 | |
phrase = do | |
debug_in "phrase" | |
s0 <- (try(many1 word)) <|> (obs_phrase >>= return . pure) | |
debug_return "phrase" $ concat s0 | |
atext = do | |
s0 <- (oneOf (['a'..'z'] ++ ['A'..'Z']) | |
<|> | |
oneOf ['0'..'9'] | |
<|> | |
oneOf "!#$%&'*+-/=?^_`{|}~") | |
debug_print $ "[OUT] atext: " ++ [s0] | |
return s0 | |
-- atom = [CFWS] 1*atext [CFWS] | |
atom = do | |
debug_in "atom" | |
s0 <- option "" cfws | |
s1 <- many1 atext | |
s2 <- option "" cfws | |
debug_return "atom" $ s0 ++ s1 ++ s2 | |
-- dot-atom = [CFWS] dot-atom-text [CFWS] | |
dot_atom = do | |
debug_in "dot_atom" | |
s0 <- option "" cfws | |
s1 <- dot_atom_text | |
s2 <- option "" cfws | |
--debug_return "dot_atom" $ "[-" ++ s0 ++ "-]" ++ s1 ++ "[-" ++ s2 ++ "-]" | |
debug_return "dot_atom" $ s0 ++ s1 ++ s2 | |
--debug_return "dot_atom" s1 | |
-- dot-atom-text = 1*atext *("." 1*atext) | |
dot_atom_text = do | |
debug_in "dot_atom_text" | |
s0 <- many1 atext | |
s1 <- many (do | |
s0 <- char '.' | |
s1 <- many1 atext | |
return $ [s0] ++ s1 | |
) | |
debug_return "dot_atom_text" $ s0 ++ (concat s1) | |
qtext = | |
(try(oneOf $ map chr $ [33] ++ [35..91] ++ [93..126]) | |
<|> | |
obs_qtext) >>= (debug_return "qtext" . pure) | |
qcontent :: ParsecT [Char] u IO [Char] | |
qcontent = qtext <|> quoted_pair | |
quoted_pair :: ParsecT [Char] u IO [Char] | |
quoted_pair = | |
(do | |
s0 <- char '\\' | |
s1 <- vchar <|> wsp | |
return $ [s0,s1] | |
) | |
<|> obs_qp | |
quoted_string :: ParsecT [Char] u IO [Char] | |
quoted_string = do | |
debug_in "quoted_string" | |
s0 <- option "" cfws | |
s1 <- char '"' | |
s2 <- many (do | |
s1 <- option "" fws | |
s2 <- qcontent | |
return $ s1 ++ s2 | |
) | |
s3 <- option "" fws | |
s4 <- char '"' | |
s5 <- option "" cfws | |
debug_return "quoted_string" $ s0 ++ [s1] ++ (concat s2) ++ s3 ++ [s4] ++ s5 | |
vchar = oneOf $ map chr [0x21..0x7e] | |
obs_NO_WS_CTL = do | |
s0 <- (oneOf $ map chr ([1..8] ++ [11] ++ [12] ++ [14..31] ++ [127])) | |
return s0 | |
obs_ctext = obs_NO_WS_CTL | |
obs_qtext = obs_NO_WS_CTL | |
obs_qp = do | |
s0 <- char '\\' | |
s1 <- (char (chr 0) <|> obs_NO_WS_CTL <|> char '\n' <|> char '\r') | |
return [s0,s1] | |
-- obs-phrase = word *(word / "." / CFWS) | |
obs_phrase :: ParsecT [Char] u IO [Char] | |
obs_phrase = do | |
s0 <- word | |
s1 <- many | |
(try word | |
<|> | |
try (char '.' >> return ".") | |
<|> | |
cfws) | |
return $ s0 ++ (concat s1) | |
runTest p s = do | |
putStrLn $ "Parse: " ++ s | |
parseTest p s | |
runTest2 p s = do | |
putStrLn $ "Parse: " ++ s | |
runParserT p () "" s >>= either print print | |
main = do | |
mapM_ (runTest2 address) | |
[ | |
"takaki@mbox.example.com" | |
, "(Takaki)takaki@mbox.example.com" | |
, "takaki(hoge)@mbox.example.com" | |
, "Jeffy <\"That Tall Guy\"@ora.com (this addres no longer active)>" | |
, "(Takaki)takaki@mbox.example.com" | |
, "takaki@[192.168.0.1]abc" | |
, "takaki@mbox.example.com" | |
, "<takaki@mbox.example.com>" | |
, "Takaki<takaki@mbox.example.com>" | |
, "\"Tak aki\" <takaki@mbox.example.com>" | |
, " Takaki <takaki@mbox.example.com>" | |
, "\"000\" A(B\"C(hh)C\"BB)AA <(pre)\"abcdef\"(po\"<>\"st)@(hoge)example.com (abc\"nest\" def)> (a \"de\" bc)" | |
, "\"!@#..$%^%^&=%\"@example.com" | |
] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment