-
-
Save BrianHicks/165554b033eb797e3ed851964ecb3a38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module CSV exposing
    ( CSV(..), parse
    , Separators, defaultSeparators
    , Plain, WithNamedFields
    , firstRowAreNames
    )

{-| Parse CSV data


# turning strings into CSVs

@docs CSV, parse

@docs Separators, defaultSeparators


# named fields

`WithNamedFields` is exposed (without its constructors) so that callers can
write the type `CSV WithNamedFields` returned by `firstRowAreNames` in their
own annotations.

@docs Plain, WithNamedFields

@docs firstRowAreNames

-}
import Parser exposing ((|.), (|=), Parser) | |
{-| The values of one row, in the order they appear in the input.
-}
type alias Row =
    List String
{-| Tag for a `CSV` straight out of `parse`: every row, including the first,
is kept in the list of rows.
-}
type Plain
    = Plain
{-| Tag for a `CSV` produced by `firstRowAreNames`.

  - `WithNamedFields names` carries the first row, removed from the data rows.
  - `EmptyHeaders` means there was no first row to take names from.

-}
type WithNamedFields
    = WithNamedFields Row
    | EmptyHeaders
{-| Parsed CSV data: a tag `a` (such as `Plain`) together with the list of
rows, each row being a list of string values.
-}
type CSV a
    = CSV a (List Row)
{-| Configuration for the parser. `value` is the character that separates
two values within a row.
-}
type alias Separators =
    { value : Char }
{-| Separators for comma-separated data: values are split on `','`.
-}
defaultSeparators : Separators
defaultSeparators =
    -- `Separators` is a record alias, so it doubles as a constructor that
    -- takes the `value` field.
    Separators ','
-- PARSING | |
{-| Parse the string `raw` into a `CSV Plain`, splitting values on
`separators.value`. Parser failures are returned as the list of dead ends
reported by `Parser.run`.
-}
parse : Separators -> String -> Result (List Parser.DeadEnd) (CSV Plain)
parse separators raw =
    raw
        |> Parser.run (rows separators)
{-| Parser for the whole input: loops over `rowsHelp` starting from no rows
and tags the collected rows as `Plain`.
-}
rows : Separators -> Parser (CSV Plain)
rows separators =
    rowsHelp separators
        |> Parser.loop []
        |> Parser.map (CSV Plain)
{-| One step of the loop over rows. `revRows` holds the rows parsed so far,
most recent first.

At the end of input the rows are returned in their original order; otherwise
one more row is parsed and pushed onto the front of `revRows`.

-}
rowsHelp : Separators -> List Row -> Parser (Parser.Step (List Row) (List Row))
rowsHelp separators revRows =
    let
        finish : () -> Parser.Step (List Row) (List Row)
        finish _ =
            Parser.Done (List.reverse revRows)

        keepGoing : Row -> Parser.Step (List Row) (List Row)
        keepGoing parsedRow =
            Parser.Loop (parsedRow :: revRows)
    in
    Parser.oneOf
        [ Parser.map finish Parser.end
        , Parser.map keepGoing (row separators)
        ]
{-| Parser for a single row: loops over `rowHelp`, starting with no values.
-}
row : Separators -> Parser Row
row separators =
    rowHelp separators
        |> Parser.loop []
{-| One step of the loop over the values of a single row. `revVals` holds the
values parsed so far, most recent first.

The row is finished at the end of input or at a `"\n"`. A separator always
introduces the value that follows it, even when that value is empty, so
`a,,b` yields `[ "a", "", "b" ]` and `a,` yields `[ "a", "" ]`. A separator
at the very start of a row also records the empty value in front of it, so
`,a` yields `[ "", "a" ]`.

-}
rowHelp : Separators -> Row -> Parser (Parser.Step Row Row)
rowHelp separators revVals =
    let
        doneWhen : Parser a -> Parser (Parser.Step Row Row)
        doneWhen =
            Parser.map (\_ -> Parser.Done (List.reverse revVals))

        nextWhen : Parser String -> Parser (Parser.Step Row Row)
        nextWhen =
            Parser.map (\newVal -> Parser.Loop (newVal :: revVals))

        -- Everything up to (not including) the next separator or newline.
        -- May be the empty string.
        unquotedValue : Parser String
        unquotedValue =
            Parser.chompWhile (\c -> c /= '\n' && c /= separators.value)
                |> Parser.getChompedString

        -- Values known to sit before the separator being consumed. Every
        -- non-terminating step adds a value, so `revVals` is only empty at
        -- the start of a row; a separator there means the first field is
        -- empty.
        beforeSeparator : Row
        beforeSeparator =
            if List.isEmpty revVals then
                [ "" ]

            else
                revVals

        -- Consume a separator together with the value after it. Consuming
        -- the separator guarantees progress even when the value is empty.
        afterSeparator : Parser (Parser.Step Row Row)
        afterSeparator =
            Parser.succeed (\newVal -> Parser.Loop (newVal :: beforeSeparator))
                |. Parser.token (String.fromChar separators.value)
                |= Parser.oneOf [ quotedValue, unquotedValue ]
    in
    Parser.oneOf
        [ doneWhen Parser.end
        , doneWhen (Parser.token "\n")
        , afterSeparator
        , nextWhen quotedValue

        -- TODO: token for \r\n after updating elm-format. It automatically
        -- formats to the wrong/old syntax for specifying codepoints in the
        -- version I have installed ATM
        --
        -- Reached only when the next character is not the end of input, a
        -- newline, a separator, or a quote, so at least one character is
        -- chomped and the loop always makes progress.
        , nextWhen unquotedValue
        ]
{-| Parser for a value wrapped in double quotes. Consumes the opening quote,
then loops over `quotedValueHelp` to collect the contents. An `Err` from that
loop is turned into a parser problem with the same message.
-}
quotedValue : Parser String
quotedValue =
    let
        unwrap : Result String String -> Parser String
        unwrap outcome =
            case outcome of
                Err message ->
                    Parser.problem message

                Ok contents ->
                    Parser.succeed contents
    in
    (Parser.succeed identity
        |. Parser.token "\""
        |= Parser.loop "" quotedValueHelp
    )
        |> Parser.andThen unwrap
{-| One step of the loop over the inside of a quoted value. `soFar` is the
text collected since the opening quote.

The alternatives are tried in order:

  - end of input: finish with an `Err`, since the closing quote is missing
  - `""` or `\"`: add a single `"` to the text
  - any other `\`: consume it without adding anything
  - `"`: finish with the collected text
  - otherwise: add everything up to the next `\` or `"`

-}
quotedValueHelp : String -> Parser (Parser.Step String (Result String String))
quotedValueHelp soFar =
    let
        -- Ignore the parsed value and keep looping with `addition` appended.
        loopWith : String -> a -> Parser.Step String (Result String String)
        loopWith addition _ =
            Parser.Loop (soFar ++ addition)
    in
    Parser.oneOf
        [ Parser.map
            (\_ -> Parser.Done (Err "I reached the end of the input while trying to parse a quoted string."))
            Parser.end
        , Parser.map (loopWith "\"") (Parser.token "\"\"")
        , Parser.map (loopWith "\"") (Parser.token "\\\"")
        , Parser.map (loopWith "") (Parser.token "\\")
        , Parser.map (\_ -> Parser.Done (Ok soFar)) (Parser.token "\"")
        , Parser.chompWhile (\c -> not (c == '\\' || c == '"'))
            |> Parser.getChompedString
            |> Parser.map (\chunk -> loopWith chunk ())
        ]
{-| Run `parser`, throw away its result, and continue the loop with the
state `soFar` unchanged.
-}
skipTo : b -> Parser a -> Parser (Parser.Step b c)
skipTo soFar parser =
    Parser.map (always (Parser.Loop soFar)) parser
-- HEADERS | |
{-| Treat the first row of a plain CSV as the field names.

The first row is removed from the data and stored in `WithNamedFields`. When
there are no rows at all, the result is tagged `EmptyHeaders` and has no rows.

-}
firstRowAreNames : CSV Plain -> CSV WithNamedFields
firstRowAreNames (CSV _ allRows) =
    case allRows of
        [] ->
            CSV EmptyHeaders []

        names :: records ->
            CSV (WithNamedFields names) records
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Apologies for the dumb question (I am new to elm/parser).
If a CSV row contains
,"Some quoted text",
(i.e. a quotation mark immediately following a separator), it will extract the text inside the quotes, as we would expect. But if it contains
, "Some quoted text",
(i.e. some whitespace between the separator and the first quote), then the extracted value still includes the quotation marks (shown escaped). How do I adapt the parser to trim whitespace between the separator and the first non-whitespace character?