Created
May 21, 2009 16:29
-
-
Save ArtemGr/115557 to your computer and use it in GitHub Desktop.
CSV parser in Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Two-column, semicolon-separated CSV: each column is either a double-quoted string or empty,
// and each record is terminated by a line break or by the end of the input.
// (?x) allows layout whitespace inside the pattern; (?s) lets '.' match line breaks inside quotes.
val pattern = java.util.regex.Pattern.compile ("""(?xs) ("(.*?)"|) ; ("(.*?)"|) (?: \r?\n | \z ) """)
val matcher = pattern.matcher (input)
while (matcher.find ()) {
  // Groups 2 and 4 capture the text between the quotes. Matcher.group returns null for a group
  // that took no part in the match (an empty column), so that case is mapped to "".
  val col1 = Option (matcher.group (2)).getOrElse ("")
  val col2 = Option (matcher.group (4)).getOrElse ("")
  // ...
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.util.parsing.combinator._ | |
import scala.util.parsing.combinator.syntactical._ | |
/** Combinator parser for two-column, semicolon-separated CSV in which a column is either
  * a double-quoted string (line breaks allowed inside the quotes) or empty.
  *
  * Two grammars are defined: `line`, built on `stringInQuotes`, and `csv`, built on
  * `col1` and `col2`. The self-test in `test` runs when the object is first initialised.
  */
object csvParser extends RegexParsers {
  test // Perform unit-testing on first load.

  // Turns off whitespace removal: line separators are an important part of the CSV format...
  override def skipWhitespace = false

  /** Record separator: either "\r\n" or "\n". */
  def CRLF = "\r\n" | "\n"

  /** Regex that matches the empty string at the very end of the input. */
  def EOF = "\\z".r

  // Any number of columns, but no backtracking over accidental double-quotes.

  /** A double-quoted string with the surrounding quotes removed, or "" when nothing is quoted. */
  def stringInQuotes = """(?xs) ".*?" |""".r ^^ {qstr =>
    if (qstr.isEmpty) "" else qstr.substring (1, qstr.length - 1)
  }

  /** One record: two `stringInQuotes` columns separated by ';' and ended by a line break or EOF. */
  def line = stringInQuotes ~ ';' ~ stringInQuotes ~ (CRLF | EOF) ^^ {
    case first ~ _ ~ second ~ _ => first :: second :: Nil
  }

  // Fixed number of columns, but backtracking over accidental double-quotes works.

  /** Strips the opening quote and the closing quote from a matched column.
    *
    * The `col1` and `col2` patterns include the character that follows the closing quote
    * (';', '\r' or '\n') in the match; when the match ends with one of those, it is dropped too.
    *
    * @param str the raw text matched by a column pattern; expected to start with a double quote
    * @return the text between the quotes
    */
  def unquote (str: String): String = {
    val trailing = str.charAt (str.length - 1) match {
      case ';' | '\r' | '\n' => 2 // closing quote plus the delimiter character
      case _ => 1 // closing quote only
    }
    str.substring (1, str.length - trailing)
  }

  /** First column: a quoted string followed by ';', or a bare ';' for an empty column. */
  def col1 = ("(?s)\".*?\";".r ^^ unquote) | (";" ^^ (_ => ""))

  /** Second column: a quoted string optionally followed by one '\r' or '\n', or an empty column
    * recognised by a '\r', a '\n' or the end of the input.
    */
  def col2 = ("(?s)\".*?\"(\\r|\\n)?".r ^^ unquote) | (("\r" | "\n" | EOF) ^^ (_ => ""))

  /** One record as a two-element list. The optional "\n" consumes the second half of a "\r\n"
    * line break, whose '\r' has already been consumed by `col2`.
    */
  def twoColumns = col1 ~ col2 ~ opt ("\n") ^^ {case v1 ~ v2 ~ _ => v1 :: v2 :: Nil}

  /** One or more records. */
  def csv: Parser[List[List[String]]] = rep1 (twoColumns)

  /** Extracts the value of a successful parse.
    *
    * @param result the outcome of running a parser
    * @return the parsed data when `result` is a `Success`
    * @throws Exception carrying `result.toString` when `result` is a `Failure` or an `Error`
    */
  def unwrap[T] (result: ParseResult[T]): T = result match {
    case Success (data, _) => data
    case failure @ Failure (_, _) => throw new Exception (failure.toString)
    case error @ Error (_, _) => throw new Exception (error.toString)
  }

  /** Self-test: parses a handful of sample inputs with `csv` and asserts on the resulting rows. */
  def test: Unit = {
    // Declared as Unit: the helper ends in an assertion and has no value to return.
    def check (s: String, expect: AnyRef): Unit = {
      val result = unwrap (parse (csv, s))
      assert (result.toString == expect.toString, "expected: \n" + expect + "\n; got: \n" + result)
    }
    check (";", List (List ("", ""))) // One string with both columns absent.
    check ("\"qq\nqq\";", List (List ("qq\nqq", "")))
    check (";\" name1 \n name2 \"", List (List ("", " name1 \n name2 ")))
    check ("\"qq\";\"zz\nzz\"", List (List ("qq", "zz\nzz")))
    check ("\"qq\";\"zz\"\n", List (List ("qq", "zz")))
    check (";\n;\n;", List (List ("", ""), List ("", ""), List ("", "")))
    check ("\"qq\";\"zz\"\n\"qq\";\"zz\"\n\"qq\";\"zz\"", List (List ("qq", "zz"), List ("qq", "zz"), List ("qq", "zz")))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment