Last active
December 23, 2015 16:09
-
-
Save nfunato/6660441 to your computer and use it in GitHub Desktop.
an exercise of Text.ParserCombinators.ReadP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- CSV file parser (as an exercise of Text.ParserCombinators.ReadP) | |
-- 2013-09-21 @nfunato | |
import Text.ParserCombinators.ReadP | |
import Control.Applicative ((<$>), (<*>), (<*), (*>)) | |
-- NOTE: | |
-- The code here is based on d.hatena.ne.jp/kazu-yamamoto/20100104/1262597082
-- which shows code for Parsec2, not ReadP | |
-- BUGS: | |
-- Apparently it cannot handle an escaped dquote. Someday it might be fixed :) | |
-- the top-level API | |
-- Parse a whole CSV document into rows of fields.
-- Runs csvFile over the input and keeps only the parses that consumed the
-- entire string, so every returned pair has "" as its second component.
-- An empty result list means no parse reached the end of the input.
parseCSV :: ReadS [[String]]
parseCSV = eatAll . readP_to_S csvFile
-- definitions for parsec2 compatibility | |
-- Keep only the parse results whose unconsumed remainder is empty.
eatAll :: [(a, String)] -> [(a, String)]
eatAll results = [done | done@(_, leftover) <- results, null leftover]
-- Run a ReadP parser on an input string, then hand the resulting list of
-- (value, remaining input) pairs to the given post-processing function.
parse' :: ([(a, String)] -> b) -> ReadP a -> String -> b
parse' post parser input = post (readP_to_S parser input)
-- Unfiltered runner for interactive tests: returns every parse, including
-- those that leave part of the input unconsumed.
parse :: ReadP a -> ReadS a
parse parser = parse' id parser
-- Parsec-style helper: match any single character that is not in the list.
noneOf :: String -> ReadP Char
noneOf excluded = satisfy (`notElem` excluded)

-- CSV specification (RFC4180)

-- A file is a sequence of records, each normally terminated by an eol.
-- RFC 4180 allows the last record to omit its line break, so any input left
-- after the eol-terminated records is read as one more, unterminated record.
-- Input that ends with an eol (or is empty) yields just the terminated records.
csvFile :: ReadP [[String]]
csvFile = do
  terminated <- record `endBy` eol
  leftover <- look -- peek only; consumes nothing
  if null leftover
    then return terminated
    else (\lastRecord -> terminated ++ [lastRecord]) <$> record

-- A record is one or more comma-separated fields.
record :: ReadP [String]
record = field `sepBy1` comma

-- A field is either bare text or a double-quoted string.
field :: ReadP String
field = nonEscF +++ escF

-- Bare field: a (possibly empty) run of ordinary characters.
nonEscF :: ReadP String
nonEscF = many txData

-- Quoted field: the surrounding double quotes are not part of the value.
escF :: ReadP String
escF = dquote *> many quotedCh <* dquote

-- One character inside a quoted field. A doubled double quote is tried first
-- and stands for a single '"'; otherwise a comma, a line break, or an ordinary
-- character is accepted. A line break is returned as the single character
-- that eol yields, so "\r\n" inside quotes becomes '\n'.
quotedCh :: ReadP Char
quotedCh = (dquote >> dquote) <++ comma +++ eol +++ txData

dquote :: ReadP Char
dquote = char '"'

comma :: ReadP Char
comma = char ','

-- Ordinary character: anything except comma, double quote, CR and LF.
txData :: ReadP Char
txData = noneOf ",\"\r\n"

-- Line break. RFC 4180 only allows CRLF, but a lone CR or LF is accepted too
-- because it is practical. CRLF is tried first and yields '\n'.
eol :: ReadP Char
eol = (cr >> lf) <++ cr <++ lf

cr :: ReadP Char
cr = char '\r'

lf :: ReadP Char
lf = char '\n'
{- | |
-- | |
-- test data (which might include Japanese text in utf-8 encoding at some time) | |
-- | |
-- Set1 (RWH Ch.16) | |
"" | |
"hi" | |
"hi\n" | |
"line1\nline2\nline3\n" | |
"cell1,cell2,cell3\n" | |
"l1c1,l1c2\nl2c1,l2c2\n" | |
"Hi,\n\n,Hello\n" | |
-- Set2 (escaping) | |
"\"cell1\",cell2,\"\"\n" | |
"\"Product\",\"Price\"\n" | |
"\"O'Reilly Socks\",10\n" | |
"\"Shirt with \"\"Haskell\"\" text\",20\n" | |
"\"Shirt, \"\"O'Reilly\"\" version\",20\n" | |
"\"Haskell Caps\",15\n" | |
-}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment