CSV parser in Scala
// Regex-only variant: each match is one row of exactly two optional, double-quoted,
// semicolon-separated columns, terminated by a line break or the end of the input.
// Flags: (?x) lets the pattern contain layout whitespace, (?s) lets '.' span line breaks
// so that a quoted value may contain embedded newlines.
// Groups: 1 and 3 are the columns including their quotes, 2 and 4 are the text inside the quotes.
val pattern = java.util.regex.Pattern.compile("""(?xs) ("(.*?)"|) ; ("(.*?)"|) (?: \r?\n | \z ) """)
val matcher = pattern.matcher(input)
while (matcher.find()) {
  // When a column is absent the empty alternative matches and the inner group does not
  // participate, so Matcher.group returns null; normalise that to the empty string,
  // which is also what the combinator-based parser produces for an absent column.
  val col1 = Option(matcher.group(2)).getOrElse("")
  val col2 = Option(matcher.group(4)).getOrElse("")
  // ...
}
import scala.util.parsing.combinator._ | |
import scala.util.parsing.combinator.syntactical._ | |
/** Parser for a two-column CSV dialect: columns are separated by `;`, values are either
  * absent or wrapped in double quotes, and quoted values may contain line breaks.
  *
  * Two alternative grammars are defined:
  *  - `line`, built from `stringInQuotes`, which does not backtrack over stray double quotes;
  *  - `twoColumns` / `csv`, built from `col1` and `col2`, where the column regex includes the
  *    trailing `;` so a stray double quote inside the first column can be backtracked over.
  *
  * The self-test in `test` runs once, when the object is first initialised.
  */
object csvParser extends RegexParsers {
  test // Perform unit-testing on first load.

  // Turns off whitespace removal: line separators are an important part of the CSV format...
  override def skipWhitespace = false

  /** Line terminator: either `\r\n` or a bare `\n`. */
  def CRLF = "\r\n" | "\n"

  /** Regex matching the very end of the input. */
  def EOF = "\\z".r

  // Any number of columns, but no backtracking over accidental double-quotes.

  /** A quoted value with its surrounding quotes removed, or the empty string when the
    * column is absent (the regex's empty alternative matched).
    */
  def stringInQuotes = """(?xs) ".*?" |""".r ^^ { quoted =>
    if (quoted.isEmpty) "" else quoted.substring(1, quoted.length - 1)
  }

  /** One row: two `stringInQuotes` columns separated by `;`, ended by a line break or end of input. */
  def line = stringInQuotes ~ ';' ~ stringInQuotes ~ (CRLF | EOF) ^^ {
    case first ~ _ ~ second ~ _ => first :: second :: Nil
  }

  // Fixed number of columns, but backtracking over accidental double-quotes works.

  /** Strips the opening quote and the closing quote from a matched column.
    *
    * The column regexes also consume one trailing delimiter character (`;`, `\r` or `\n`)
    * when it is present, so in that case two characters are dropped from the end instead of one.
    *
    * @param str the raw text matched by a column regex; must start with a double quote
    * @return the text between the quotes
    */
  def unquote(str: String): String = {
    val trailing = str.charAt(str.length - 1) match {
      case ';' | '\r' | '\n' => 2 // closing quote plus the consumed delimiter
      case _ => 1                 // closing quote only
    }
    str.substring(1, str.length - trailing)
  }

  /** First column: a quoted value followed by `;`, or a bare `;` for an absent value. */
  def col1 = ("(?s)\".*?\";".r ^^ unquote) | (";" ^^ (_ => ""))

  /** Second column: a quoted value optionally followed by a single `\r` or `\n`, or, for an
    * absent value, just the line-break character or the end of input.
    */
  def col2 = ("(?s)\".*?\"(\\r|\\n)?".r ^^ unquote) | (("\r" | "\n" | EOF) ^^ (_ => ""))

  /** One row of exactly two columns. The optional `\n` consumes the second half of a
    * `\r\n` terminator, since `col2` consumes at most one line-break character.
    */
  def twoColumns = col1 ~ col2 ~ opt("\n") ^^ { case v1 ~ v2 ~ _ => v1 :: v2 :: Nil }

  /** One or more rows, each a two-element list of column values. */
  def csv: Parser[List[List[String]]] = rep1(twoColumns)

  /** Extracts the value of a successful parse.
    *
    * @param result the outcome of running a parser
    * @return the parsed data when `result` is a `Success`
    * @throws Exception carrying the textual description of the `Failure` or `Error` otherwise
    */
  def unwrap[T](result: ParseResult[T]): T = result match {
    case Success(data, _) => data
    case f @ Failure(_, _) => throw new Exception(f.toString)
    case e @ Error(_, _) => throw new Exception(e.toString)
  }

  /** Self-test: parses a handful of sample inputs with `csv` and asserts on the result. */
  def test: Unit = {
    // Compares string renderings of the parsed and expected values. The helper returns
    // Unit because `assert` does; it was previously declared as AnyRef, which does not type-check.
    def check(s: String, expect: AnyRef): Unit = {
      val result = unwrap(parse(csv, s))
      assert(result.toString == expect.toString, "expected: \n" + expect + "\n; got: \n" + result)
    }
    check(";", List(List("", ""))) // One string with both columns absent.
    check("\"qq\nqq\";", List(List("qq\nqq", "")))
    check(";\" name1 \n name2 \"", List(List("", " name1 \n name2 ")))
    check("\"qq\";\"zz\nzz\"", List(List("qq", "zz\nzz")))
    check("\"qq\";\"zz\"\n", List(List("qq", "zz")))
    check(";\n;\n;", List(List("", ""), List("", ""), List("", "")))
    check("\"qq\";\"zz\"\n\"qq\";\"zz\"\n\"qq\";\"zz\"", List(List("qq", "zz"), List("qq", "zz"), List("qq", "zz")))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment