public
Last active

CSV parser in Scala

  • Download Gist
Alternative implementation, using a regular expression:
1 2 3 4 5 6 7
// Scans two-column, semicolon-separated rows with a single regular expression.
// Flags: (?x) lets the pattern contain insignificant whitespace, (?s) lets '.' match
// line terminators so a quoted value may span several lines.
// Groups 1 and 3 are the optional quoted columns including their quotes;
// groups 2 and 4 are the text between the quotes.
val pattern = java.util.regex.Pattern.compile ("""(?xs) ("(.*?)"|) ; ("(.*?)"|) (?: \r?\n | \z ) """)
val matcher = pattern.matcher (input)
while (matcher.find ()) {
  // When a column is absent the empty alternative is taken, the inner group does not
  // participate in the match, and group(n) returns null. Normalise that to "" so the
  // loop body never has to deal with null.
  val col1 = Option (matcher.group (2)).getOrElse ("")
  val col2 = Option (matcher.group (4)).getOrElse ("")
  // ...
}
csvParser.scala
Scala
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
import scala.util.parsing.combinator._
import scala.util.parsing.combinator.syntactical._
/** Parser-combinator grammar for a small CSV dialect: rows of two optional, double-quoted
  * columns separated by ';'. Quoted values may contain line breaks.
  *
  * Two grammars are defined: `line` (built on `stringInQuotes`) and `twoColumns`
  * (built on `col1` / `col2`). The entry point `csv` uses `twoColumns`.
  *
  * The built-in self-test (`test`) runs when the object is first initialised.
  */
object csvParser extends RegexParsers {
  test // Perform unit-testing on first load.

  // Turns off whitespace removal: line separators are an important part of the CSV format.
  override def skipWhitespace = false

  /** Row terminator: "\r\n" or a bare "\n". */
  def CRLF: Parser[String] = "\r\n" | "\n"

  /** Regex that matches (without consuming anything) only at the very end of the input. */
  def EOF: scala.util.matching.Regex = "\\z".r

  // Any number of columns, but no backtracking over accidental double-quotes.

  /** An optional double-quoted string; yields the text between the quotes, or "" when absent. */
  def stringInQuotes: Parser[String] = """(?xs) ".*?" |""".r ^^ { quoted =>
    // The regex may match the empty string (absent column); otherwise drop both quotes.
    if (quoted.isEmpty) "" else quoted.substring (1, quoted.length - 1)
  }

  /** One row for the `stringInQuotes` grammar: column ';' column, then a row terminator or end of input. */
  def line: Parser[List[String]] =
    stringInQuotes ~ ';' ~ stringInQuotes ~ (CRLF | EOF) ^^ { case first ~ _ ~ second ~ _ => List (first, second) }

  // Fixed number of columns, but backtracking over accidental double-quotes works.

  /** Strips the opening quote and the closing quote from a matched column. When the match
    * also captured one trailing ';', '\r' or '\n', that character is dropped as well.
    *
    * @param str text matched by the `col1` / `col2` regexes (starts with a double quote)
    * @return the text between the quotes
    */
  def unquote (str: String): String = {
    val charsToDropAtEnd = str.charAt (str.length - 1) match {
      case ';' | '\r' | '\n' => 2 // closing quote plus the captured separator/terminator
      case _ => 1 // closing quote only
    }
    str.substring (1, str.length - charsToDropAtEnd)
  }

  /** First column including its ';' separator: a quoted value, or a lone ';' meaning "absent". */
  def col1: Parser[String] = ("(?s)\".*?\";".r ^^ unquote _) | (";" ^^ (_ => ""))

  /** Second column: a quoted value optionally followed by one '\r' or '\n', or, when the
    * column is absent, a single '\r' / '\n' or the end of input.
    */
  def col2: Parser[String] = ("(?s)\".*?\"(\\r|\\n)?".r ^^ unquote _) | (("\r" | "\n" | EOF) ^^ (_ => ""))

  /** One row: both columns, then an optional "\n" (completes a "\r\n" pair after `col2` took the '\r'). */
  def twoColumns: Parser[List[String]] = col1 ~ col2 ~ opt ("\n") ^^ { case v1 ~ v2 ~ _ => List (v1, v2) }

  /** Entry point: one or more rows, each a two-element list of column values. */
  def csv: Parser[List[List[String]]] = rep1 (twoColumns)

  /** Extracts the value of a successful parse.
    *
    * @param result outcome of running one of this object's parsers
    * @return the parsed data when `result` is a `Success`
    * @throws Exception carrying the result's `toString` when the parse failed or errored
    */
  def unwrap[T] (result: ParseResult[T]): T = result match {
    case Success (data, _) => data
    // Failure and Error are the two NoSuccess variants; both are reported the same way.
    case failure: NoSuccess => throw new Exception (failure.toString)
  }

  /** Self-test: parses a handful of sample inputs with `csv` and asserts the resulting rows. */
  def test: Unit = {
    // parseAll (rather than parse) is used so that a case cannot pass while part of the
    // input is left unconsumed.
    def check (s: String, expect: AnyRef): Unit = {
      val result = unwrap (parseAll (csv, s))
      assert (result.toString == expect.toString, "expected: \n" + expect + "\n; got: \n" + result)
    }
    check (";", List (List ("", ""))) // One string with both columns absent.
    check ("\"qq\nqq\";", List (List ("qq\nqq", "")))
    check (";\" name1 \n name2 \"", List (List ("", " name1 \n name2 ")))
    check ("\"qq\";\"zz\nzz\"", List (List ("qq", "zz\nzz")))
    check ("\"qq\";\"zz\"\n", List (List ("qq", "zz")))
    check (";\n;\n;", List (List ("", ""), List ("", ""), List ("", "")))
    check ("\"qq\";\"zz\"\n\"qq\";\"zz\"\n\"qq\";\"zz\"", List (List ("qq", "zz"), List ("qq", "zz"), List ("qq", "zz")))
  }
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.