Skip to content

Instantly share code, notes, and snippets.

@apivovarov
Last active December 3, 2015 01:55
Show Gist options
  • Save apivovarov/16ed9a703a821fd66259 to your computer and use it in GitHub Desktop.
Save apivovarov/16ed9a703a821fd66259 to your computer and use it in GitHub Desktop.
CSV parser in Scala
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import scala.language.postfixOps
import scala.util.parsing.combinator.RegexParsers
// A CSV parser based on RFC4180
// http://tools.ietf.org/html/rfc4180
/**
 * CSV parser built on `RegexParsers`, following the grammar of RFC 4180.
 *
 * A file is a sequence of records separated by line breaks (`\r\n` or `\n`),
 * optionally terminated by one final line break. A record is a sequence of
 * comma-separated fields. A field is either quoted ("escaped") or plain
 * ("nonescaped"); inside a quoted field commas and line breaks are literal
 * and `""` stands for a single `"`.
 *
 * Entry points are [[parse]] (whole input) and [[parseLine]] (first record only).
 */
object CSV extends RegexParsers {
  // Spaces are meaningful in CSV, so the automatic whitespace skipping is disabled.
  override val skipWhitespace = false

  // ---- lexical tokens -------------------------------------------------------

  /** Field separator. */
  def COMMA = ","

  /** Quote character that opens and closes an escaped field. */
  def DQUOTE = "\""

  /** Two consecutive double quotes inside an escaped field, yielding one literal quote. */
  def DQUOTE2 = "\"\"" ^^^ "\""

  /** Record separator: either `\r\n` or a bare `\n`. */
  def CRLF = "\r\n" | "\n"

  /** Exactly one character that is not a quote, comma, CR or LF. */
  def TXT = "[^\",\r\n]".r

  /** A run of one or more spaces/tabs (tolerated around a quoted field). */
  def SPACES = "[ \t]+".r

  // Zero-width match that succeeds only when no input is left.
  private def EOF: Parser[String] = """\z""".r

  // ---- grammar --------------------------------------------------------------

  /**
   * Whole input: one or more records separated by line breaks, plus an
   * optional final line break.
   *
   * A line break counts as a record separator only when more input follows it.
   * Without that guard the final line break would be consumed as a separator
   * and, because a record may be empty, a spurious `List("")` record would be
   * appended (and the optional trailing line break could never match).
   *
   * `record` always succeeds (possibly on the empty string), so the result of
   * `rep1sep` is never empty; empty input yields `List(List(""))`.
   */
  def file: Parser[List[List[String]]] =
    rep1sep(record, CRLF <~ not(EOF)) <~ opt(CRLF)

  /** One record: fields separated by commas. */
  def record: Parser[List[String]] = repsep(field, COMMA)

  /** One field; the quoted form is tried first, then the plain form. */
  def field: Parser[String] = escaped | nonescaped

  /**
   * Quoted field, optionally surrounded by spaces/tabs that are discarded.
   * The content between the quotes is returned with `""` collapsed to `"`.
   */
  def escaped: Parser[String] =
    (opt(SPACES) ~> DQUOTE ~> rep(TXT | COMMA | CRLF | DQUOTE2) <~ DQUOTE <~ opt(SPACES)) ^^ {
      pieces => pieces.mkString
    }

  /** Plain field: zero or more `TXT` characters (may be empty). */
  def nonescaped: Parser[String] = rep(TXT) ^^ { chars => chars.mkString }

  // ---- public API -----------------------------------------------------------

  /**
   * Parses complete CSV text.
   *
   * @param s the CSV input
   * @return the records in input order, each a list of field values
   * @throws Exception if the input does not match the grammar; the message is
   *                   the textual form of the parser's failure result
   */
  def parse(s: String): List[List[String]] = parseAll(file, s) match {
    case Success(records, _) => records
    case noSuccess => throw new Exception(noSuccess.toString)
  }

  /**
   * Parses `s` and returns its first record.
   *
   * `head` is safe here because `file` always produces at least one record.
   *
   * @param s the CSV input (any records after the first are parsed but dropped)
   * @return the fields of the first record
   * @throws Exception if the input does not match the grammar
   */
  def parseLine(s: String): List[String] = parse(s).head

  /** Small self-check of the parser using assertions. */
  def main(args: Array[String]): Unit = {
    assert(CSV.parseLine(""""a",2,"b","c's'",5""") == List("a", "2", "b", "c's'", "5"))
    assert(CSV.parseLine("""1997,Ford,E350,"Super, ""luxurious"" truck"""") ==
      List("1997", "Ford", "E350", """Super, "luxurious" truck"""))
    assert(CSV.parseLine("") == List(""))
    // A final line break terminates the last record instead of starting an empty one.
    assert(CSV.parse("a,b\r\nc,d\r\n") == List(List("a", "b"), List("c", "d")))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment