Created
November 29, 2012 14:41
-
-
Save takaki/4169514 to your computer and use it in GitHub Desktop.
Parsec: RFC 5322 (RFC 822) email address parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Debug.Trace | |
import Control.Monad.State | |
import Text.Parsec | |
import Data.Char | |
import Control.Monad.Trans | |
import Control.Applicative hiding((<|>),many,optional) | |
-- trace s f = f | |
-- ref. RFC5322 and RFC5234 | |
--debug_putStrLn = putStrLn | |
debug_putStrLn = return | |
debug_print s0 = liftIO $ debug_putStrLn $ s0 | |
debug_in s0 = debug_print $ "[IN] " ++ s0 | |
debug_return s0 s1 = do | |
liftIO $ debug_putStrLn $ "[OUT] " ++ s0 ++ " : [" ++ s1 ++ "]" | |
return s1 | |
-- address = mailbox / group | |
address :: ParsecT [Char] u IO [Char] | |
address = do | |
debug_in "address" | |
s0 <- mailbox <|> group | |
-- debug_return "address" s0 | |
return s0 | |
-- mailbox = name-addr / addr-spec | |
mailbox = do | |
debug_in "mailbox" | |
s0 <- try(name_addr) <|> addr_spec | |
debug_return "mailbox" s0 | |
-- name-addr = [display-name] angle-addr | |
name_addr = do | |
debug_in "name_addr" | |
s0 <- option "" display_name | |
s1 <- angle_addr | |
liftIO $ putStrLn $ " DISPLAY_NAME: " ++ s0 | |
liftIO $ putStrLn $ " ADDDR_SPEC: " ++ s1 | |
debug_return "name_addr" $ s0 ++ s1 | |
-- angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / | |
-- obs-angle-addr | |
angle_addr = do | |
debug_in "angle_addr" | |
s0 <- option "" cfws | |
s1 <- char '<' | |
s2 <- addr_spec | |
s3 <- char '>' | |
s4 <- option "" cfws | |
liftIO $ putStrLn $ " ADDDR_SPEC: " ++ s2 | |
debug_return "angle_addr" $ s0 ++ [s1] ++ s2 ++ [s3] ++ s4 | |
<|> | |
obs_angle_addr | |
-- group = display-name ":" [group-list] ";" [CFWS] | |
group = do | |
debug_in "group" | |
s0 <- display_name | |
s1 <- char ':' | |
s2 <- option "" group_list | |
s3 <- char ';' | |
s4 <- option "" cfws | |
debug_return "group" $ s0 ++ [s1] ++ s2 ++ [s3] ++ s4 | |
-- display-name = phrase | |
display_name = do | |
debug_in "display_name" | |
s0 <- phrase | |
debug_return "display_name" s0 | |
-- mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list | |
mailbox_list :: ParsecT [Char] u IO [Char] | |
mailbox_list = do { | |
s0 <- mailbox ; | |
s1 <- many (do | |
s0 <- char ',' | |
s1 <- mailbox | |
return $ [s0] ++ s1 | |
); | |
debug_return "mailbox_list" $ s0 ++ (concat s1) | |
} <|> | |
obs_mbox_list | |
-- address-list = (address *("," address)) / obs-addr-list | |
{- | |
address_list :: ParsecT [Char] u IO [Char] | |
address_list = try(do { | |
address ; | |
many (char ',' >> address) | |
} ) <|> | |
obs_addr_list | |
-} | |
-- group-list = mailbox-list / CFWS / obs-group-list | |
group_list = do | |
mailbox_list | |
<|> | |
cfws | |
<|> | |
obs_group_list | |
-- addr-spec = local-part "@" domain | |
addr_spec = do | |
debug_in "addr_spec" | |
s0 <- local_part | |
s1 <- char '@' | |
s2 <- domain | |
liftIO $ putStrLn $ " LOCAL-PART: " ++ s0 | |
liftIO $ putStrLn $ " DOMAIN: " ++ s2 | |
debug_return "addr_spec" $ s0 ++ [s1] ++ s2 | |
-- local-part = dot-atom / quoted-string / obs-local-part | |
local_part = do | |
debug_in "local_part" | |
s0 <- (try dot_atom) <|> try quoted_string <|> obs_local_part | |
debug_return "local_part" s0 | |
-- domain = dot-atom / domain-literal / obs-domain | |
domain = do | |
debug_in "domain" | |
s0 <- try dot_atom <|> try domain_literal <|> obs_domain | |
debug_return "domain" s0 | |
domain_literal = do | |
s0 <- option "" cfws | |
s1 <- char '[' | |
s2 <- many (do | |
s0 <- option "" fws | |
s1 <- dtext | |
return $ s0 ++ s1 | |
) | |
s3 <- option "" fws | |
s4 <- char ']' | |
s5 <- option "" cfws | |
debug_return "domain_literal" $ s0 ++ [s1] ++ (concat s2) ++ s3 ++ [s4] ++ s5 | |
dtext = do | |
debug_in "dtext" | |
s0 <- do { | |
s0 <- oneOf $ map chr ([33..90] ++ [94..126]); | |
return [s0] | |
}<|> obs_dtext | |
debug_return "dtext" s0 | |
-- CFWS = (1*([FWS] comment) [FWS]) / FWS | |
cfws :: ParsecT [Char] u IO [Char] | |
cfws = do | |
debug_in "cfws" | |
s0 <- try(do { | |
s0 <- many1 (do | |
s0 <- option "" fws | |
s1 <- comment | |
return $ s0 ++ s1 | |
); | |
s1 <- option "" fws; | |
return $ (concat s0) ++ s1 | |
} ) <|> fws | |
-- debug_return "cfws" s0 | |
debug_return "cfws" "" | |
-- FWS = ([*WSP CRLF] 1*WSP) / obs-FWS | |
fws :: ParsecT [Char] u IO [Char] | |
fws = do | |
debug_in "fws" | |
s0 <- try(do { | |
s0 <- try (option "" (do | |
s0 <- many wsp | |
s1 <- crlf | |
return $ s0 ++ s1 | |
)); | |
s1 <- many1 wsp; | |
return $ s0 ++ s1 | |
} ) <|> obs_fws | |
debug_return "fws" s0 | |
wsp = do | |
s1 <- char ' ' <|> char '\t' <?> "@wsp" | |
-- debug_print "wsp" $ "'" ++ [s1] ++ "'" | |
return s1 | |
-- obs-FWS = 1*WSP *(CRLF 1*WSP) | |
obs_fws = do | |
s0 <- many1 wsp | |
s1 <- many (do | |
s0 <- crlf | |
s1 <- many1 wsp | |
return $ s0 ++ s1 | |
) | |
debug_return "OBS_fws" $ s0 ++ (concat s1) | |
crlf = do | |
s1 <- ( do | |
s0 <- char '\r' | |
s1 <- char '\n' | |
return $ [s0, s1] | |
) <?> "@crlf" | |
debug_return "crlf" s1 | |
ctext = | |
try (oneOf $ map chr ([33..39] ++ [42..91] ++ [93..126])) | |
<|> | |
obs_ctext | |
-- ccontent = ctext / quoted-pair / comment | |
ccontent :: ParsecT [Char] u IO [Char] | |
ccontent = do | |
debug_in "ccontent" | |
s0 <- ((try ctext) >>= return . pure) | |
<|> | |
(try quoted_pair) | |
<|> | |
comment | |
debug_return "ccontent" s0 | |
-- comment = "(" *([FWS] ccontent) [FWS] ")" | |
comment = do | |
debug_in "comment" | |
s0 <- char '(' | |
s1 <- many (do | |
s0 <- option "" fws | |
s1 <- ccontent | |
return $ s0 ++ s1 | |
) | |
s2 <- option "" fws | |
s3 <- char ')' | |
debug_return "comment" $ [s0] ++ (concat s1) ++ s2 ++ [s3] | |
-- obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] | |
obs_angle_addr = do | |
s0 <- option "" cfws | |
s1 <- char '<' | |
s2 <- obs_route | |
s3 <- addr_spec | |
s4 <- char '>' | |
s5 <- option "" cfws | |
debug_return "OBS_angle_addr" $ s0 ++ [s1] ++ s2 ++ s3 ++ [s4] ++ s5 | |
-- obs-route = obs-domain-list ":" | |
obs_route = do | |
s0 <- obs_domain_list | |
s1 <- char ':' | |
debug_return "OBS_route" $ s0 ++ [s1] | |
-- obs-domain-list = *(CFWS / ",") "@" domain | |
-- *("," [CFWS] ["@" domain]) | |
obs_domain_list = do | |
s0 <- many (cfws <|> (char ',' >> return ",")) | |
s1 <- char '@' | |
s2 <- domain | |
s3 <- many (do | |
s0 <- char ',' | |
s1 <- option "" cfws | |
s2 <- option "" (do | |
s0 <- char '@' | |
s1 <- domain | |
return $ [s0] ++ s1) | |
return $ [s0] ++ s1 ++ s2) | |
debug_return "OBS_domain_list" $ (concat s0) ++ [s1] ++ s2 ++ (concat s3) | |
-- obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) | |
obs_mbox_list = do | |
s0 <- many (do | |
s0 <- cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- mailbox | |
s2 <- many (do | |
s0 <- char ',' | |
s1 <- option "" (mailbox <|> cfws) | |
return $ [s0] ++ s1 | |
) | |
debug_return "OBS_mbox_list" $ (concat s0) ++ s1 ++ (concat s2) | |
-- obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) | |
{- | |
obs_addr_list :: ParsecT [Char] u IO [Char] | |
obs_addr_list = do | |
s0 <- many (do | |
s0 <- option "" cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- address | |
s2 <- many (do | |
s0 <- char ',' | |
s1 <- option "" (address <|> cfws) | |
return $ [s0] ++ s1) | |
return $ (concat s0) ++ s1 ++ (concat s2) | |
-} | |
-- obs-group-list = 1*([CFWS] ",") [CFWS] | |
obs_group_list = do | |
s0 <- many1 (do | |
s0 <- option "" cfws | |
s1 <- char ',' | |
return $ s0 ++ [s1] | |
) | |
s1 <- option "" cfws | |
return $ (concat s0) ++ s1 | |
-- obs-local-part = word *("." word) | |
obs_local_part = do | |
s0 <- word | |
s1 <- many (do | |
s0 <- char '.' | |
s1 <- word | |
return $ [s0] ++ s1 | |
) | |
return $ s0 ++ (concat s1) | |
-- obs-domain = atom *("." atom) | |
obs_domain = do | |
s0 <- atom | |
s1 <- many (char '.' >> atom) | |
return $ s0 ++ (concat s1) | |
-- obs-dtext = obs-NO-WS-CTL / quoted-pair | |
obs_dtext :: ParsecT [Char] u IO [Char] | |
obs_dtext = do | |
debug_in "OBS_dtext" | |
(obs_NO_WS_CTL >>= return . pure) | |
<|> | |
quoted_pair | |
word :: ParsecT [Char] u IO [Char] | |
word = do | |
debug_in "word" | |
s0 <- try atom <|> quoted_string | |
debug_return "word" s0 | |
phrase = do | |
debug_in "phrase" | |
s0 <- (try(many1 word)) <|> (obs_phrase >>= return . pure) | |
debug_return "phrase" $ concat s0 | |
atext = do | |
s0 <- (oneOf (['a'..'z'] ++ ['A'..'Z']) | |
<|> | |
oneOf ['0'..'9'] | |
<|> | |
oneOf "!#$%&'*+-/=?^_`{|}~") | |
debug_print $ "[OUT] atext: " ++ [s0] | |
return s0 | |
-- atom = [CFWS] 1*atext [CFWS] | |
atom = do | |
debug_in "atom" | |
s0 <- option "" cfws | |
s1 <- many1 atext | |
s2 <- option "" cfws | |
debug_return "atom" $ s0 ++ s1 ++ s2 | |
-- dot-atom = [CFWS] dot-atom-text [CFWS] | |
dot_atom = do | |
debug_in "dot_atom" | |
s0 <- option "" cfws | |
s1 <- dot_atom_text | |
s2 <- option "" cfws | |
--debug_return "dot_atom" $ "[-" ++ s0 ++ "-]" ++ s1 ++ "[-" ++ s2 ++ "-]" | |
debug_return "dot_atom" $ s0 ++ s1 ++ s2 | |
--debug_return "dot_atom" s1 | |
-- dot-atom-text = 1*atext *("." 1*atext) | |
dot_atom_text = do | |
debug_in "dot_atom_text" | |
s0 <- many1 atext | |
s1 <- many (do | |
s0 <- char '.' | |
s1 <- many1 atext | |
return $ [s0] ++ s1 | |
) | |
debug_return "dot_atom_text" $ s0 ++ (concat s1) | |
qtext = | |
(try(oneOf $ map chr $ [33] ++ [35..91] ++ [93..126]) | |
<|> | |
obs_qtext) >>= (debug_return "qtext" . pure) | |
qcontent :: ParsecT [Char] u IO [Char] | |
qcontent = qtext <|> quoted_pair | |
quoted_pair :: ParsecT [Char] u IO [Char] | |
quoted_pair = | |
(do | |
s0 <- char '\\' | |
s1 <- vchar <|> wsp | |
return $ [s0,s1] | |
) | |
<|> obs_qp | |
quoted_string :: ParsecT [Char] u IO [Char] | |
quoted_string = do | |
debug_in "quoted_string" | |
s0 <- option "" cfws | |
s1 <- char '"' | |
s2 <- many (do | |
s1 <- option "" fws | |
s2 <- qcontent | |
return $ s1 ++ s2 | |
) | |
s3 <- option "" fws | |
s4 <- char '"' | |
s5 <- option "" cfws | |
debug_return "quoted_string" $ s0 ++ [s1] ++ (concat s2) ++ s3 ++ [s4] ++ s5 | |
vchar = oneOf $ map chr [0x21..0x7e] | |
obs_NO_WS_CTL = do | |
s0 <- (oneOf $ map chr ([1..8] ++ [11] ++ [12] ++ [14..31] ++ [127])) | |
return s0 | |
obs_ctext = obs_NO_WS_CTL | |
obs_qtext = obs_NO_WS_CTL | |
obs_qp = do | |
s0 <- char '\\' | |
s1 <- (char (chr 0) <|> obs_NO_WS_CTL <|> char '\n' <|> char '\r') | |
return [s0,s1] | |
-- obs-phrase = word *(word / "." / CFWS) | |
obs_phrase :: ParsecT [Char] u IO [Char] | |
obs_phrase = do | |
s0 <- word | |
s1 <- many | |
(try word | |
<|> | |
try (char '.' >> return ".") | |
<|> | |
cfws) | |
return $ s0 ++ (concat s1) | |
runTest p s = do | |
putStrLn $ "Parse: " ++ s | |
parseTest p s | |
runTest2 p s = do | |
putStrLn $ "Parse: " ++ s | |
runParserT p () "" s >>= either print print | |
main = do | |
mapM_ (runTest2 address) | |
[ | |
"takaki@mbox.example.com" | |
, "(Takaki)takaki@mbox.example.com" | |
, "takaki(hoge)@mbox.example.com" | |
, "Jeffy <\"That Tall Guy\"@ora.com (this addres no longer active)>" | |
, "(Takaki)takaki@mbox.example.com" | |
, "takaki@[192.168.0.1]abc" | |
, "takaki@mbox.example.com" | |
, "<takaki@mbox.example.com>" | |
, "Takaki<takaki@mbox.example.com>" | |
, "\"Tak aki\" <takaki@mbox.example.com>" | |
, " Takaki <takaki@mbox.example.com>" | |
, "\"000\" A(B\"C(hh)C\"BB)AA <(pre)\"abcdef\"(po\"<>\"st)@(hoge)example.com (abc\"nest\" def)> (a \"de\" bc)" | |
, "\"!@#..$%^%^&=%\"@example.com" | |
] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment