Skip to content

Instantly share code, notes, and snippets.

@ntbrock
Created July 7, 2015 19:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ntbrock/e6ad9543d6804a14c190 to your computer and use it in GitHub Desktop.
Save ntbrock/e6ad9543d6804a14c190 to your computer and use it in GitHub Desktop.
import scala.util.parsing.combinator._
/** Pipe-separated variant: a [[HeaderCharacterRegexParser]] whose field delimiter is "|". */
class HeaderPsvRegexParser extends HeaderCharacterRegexParser("|")
/** Comma-separated variant: a [[HeaderCharacterRegexParser]] whose field delimiter is ",". */
class HeaderCsvRegexParser extends HeaderCharacterRegexParser(",")
/**
 * Combinator parser for delimiter-separated text (CSV, PSV, ...) with
 * double-quote escaping.
 *
 * Grammar summary:
 *  - a file is a sequence of records separated by a line break (CRLF, LF or CR);
 *  - a record is one or more fields separated by the delimiter;
 *  - a field is either quoted (`"..."`, where `""` stands for one literal `"`
 *    and the delimiter / CR / LF may appear inside) or unquoted;
 *  - horizontal whitespace around a field is ignored, and the text of an
 *    unquoted field is trimmed.
 *
 * @param delimiterCharacter the single-character field separator, e.g. "," or "|"
 */
class HeaderCharacterRegexParser(delimiterCharacter:String) extends RegexParsers
{
  // 2012-Jul-20 - The parser was removing spaces.
  // 2015-Jul-07 - This override is required so that internal spaces remain.
  // RegexParsers skips `whiteSpace` before every literal/regex token; narrowing
  // it to a tab keeps ordinary spaces inside field content.
  override protected val whiteSpace = """[\t]""".r

  /** The field delimiter (named COMMA for historical reasons; it may be any character). */
  def COMMA = delimiterCharacter
  def DQUOTE = "\""
  /** An escaped quote inside a quoted field: `""` yields a single `"`. */
  def DQUOTE2: Parser[String] = literal("\"\"") ^^^ "\""
  def CR = "\r"
  def LF = "\n"
  def CRLF = "\r\n"

  // The delimiter is quoted before being spliced into the character class so
  // that delimiters which are special inside `[...]` (such as "-", "]" or "\")
  // are excluded literally instead of corrupting the pattern.
  private def fieldBreakers: String =
    "\"" + java.util.regex.Pattern.quote(COMMA) + "\r\n"

  /** One character that is not a quote, the delimiter, CR or LF. */
  def TXT = ("[^" + fieldBreakers + "]").r
  /**
   * One or more such characters. Despite the name, `+` is a greedy quantifier.
   * Not referenced by the grammar in this class; retained for existing callers.
   */
  def TXT_NON_GREEDY = ("[^" + fieldBreakers + "]+").r

  // 2015-Jun-09 Brockman - Attentiveness to whitespace in file
  // Optional padding around a field. CR and LF are deliberately excluded: a
  // plain `\s*` here would consume the line break that ends a record, and
  // `file` could then never split the input into more than one record.
  def WS_OPTION = """[^\S\r\n]*""".r

  /** A line break; CRLF is tried first so it is not read as a lone CR. */
  private def EOL: Parser[String] = literal(CRLF) | LF | CR

  /** Succeeds when only whitespace remains before the end of the input. */
  private def blankTail: Parser[String] = """\s*\z""".r

  // A line break separates two records only if something other than whitespace
  // follows it. A break followed by nothing but whitespace is trailing filler,
  // so "a,b\r\n" still yields exactly one record.
  private def recordBreak: Parser[String] = EOL <~ not(blankTail)

  def file: Parser[List[List[String]]] = repsep(record, recordBreak) <~ """\s*""".r
  def record: Parser[List[String]] = rep1sep(field, COMMA)
  def field: Parser[String] = escaped | nonescaped

  /** A quoted field; the surrounding quotes are dropped and the content is kept untrimmed. */
  def escaped: Parser[String] = {
    val content = rep(TXT | COMMA | CR | LF | DQUOTE2)
    WS_OPTION ~> (DQUOTE ~> content <~ DQUOTE) <~ WS_OPTION ^^ (_.mkString(""))
  }

  /**
   * An unquoted field (possibly empty).
   * 2015- Only trim space content around non escaped terms. Example: Spacey , -> "Spacey"
   */
  def nonescaped: Parser[String] =
    WS_OPTION ~> rep(regex(TXT)) <~ WS_OPTION ^^ (_.mkString("").trim)

  /**
   * Parses the whole of `s` as a delimited file.
   *
   * @param s the complete text to parse
   * @return one inner list of field values per record, or an empty list when
   *         `s` does not match the grammar in its entirety
   */
  def parse(s: String): List[List[String]] = parseAll(file, s) match {
    case Success(rows, _) => rows
    case _ => Nil
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment