Created
July 7, 2015 19:39
-
-
Save ntbrock/995a9d622780f4999928 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.util.parsing.combinator._ | |
/** Pipe-separated preset: a HeaderCharacterRegexParser constructed with "|" as its delimiter. */
class HeaderPsvRegexParser extends HeaderCharacterRegexParser(delimiterCharacter = "|")
/** Comma-separated preset: a HeaderCharacterRegexParser constructed with "," as its delimiter. */
class HeaderCsvRegexParser extends HeaderCharacterRegexParser(delimiterCharacter = ",")
/**
 * Combinator grammar for delimiter-separated text with optional double-quoted fields.
 *
 * Records are separated by a line break ("\r\n", "\n" or "\r"); fields inside a record are
 * separated by `delimiterCharacter`. A quoted field may contain the delimiter, line breaks and
 * doubled quotes (`""` is read as one `"`). Unquoted fields are trimmed.
 *
 * @param delimiterCharacter the field separator, e.g. "," or "|"
 */
class HeaderCharacterRegexParser(delimiterCharacter: String) extends RegexParsers
{
  // 2012-Jul-20 - The parser was removing spaces.
  // 2015-Jul-07 - This override is required so that internal spaces remain.
  // Only a tab is skipped implicitly before each token; spaces are left for the grammar to handle.
  override protected val whiteSpace = """[\t]""".r

  // def COMMA = ","
  def COMMA = delimiterCharacter
  def DQUOTE = "\""
  // An escaped quote inside a quoted field: two quotes in the input yield one quote in the value.
  def DQUOTE2 = "\"\"" ^^ { case _ => "\"" }
  def CR = "\r"
  def LF = "\n"
  def CRLF = "\r\n"

  // One character that is neither a quote, the delimiter, nor a line break.
  def TXT = ("[^\"" + delimiterForCharClass + "\r\n]").r
  def TXT_NON_GREEDY = ("[^\"" + delimiterForCharClass + "\r\n]+").r

  // 2015-Jun-09 Brockman - Attentiveness to whitespace in file
  // Matches ANY whitespace, line breaks included, so it is only used at the two ends of the file.
  def WS_OPTION = "\\s*".r

  // The delimiter rewritten so each non-alphanumeric character is backslash-escaped. Without this,
  // delimiters such as "-", "[", "]" or "\" would be read as character-class syntax by TXT.
  private def delimiterForCharClass: String =
    COMMA.map(c => if (Character.isLetterOrDigit(c)) c.toString else "\\" + c).mkString

  // Optional whitespace around a field that never crosses a line break. Using "\s*" here would
  // swallow the record separator and make every multi-record input fail.
  private def inlineSpace = """[^\S\r\n]*""".r

  // A line break that separates two records. The lookahead rejects a break that is followed only
  // by whitespace up to end of input, so a trailing newline does not create an extra empty record.
  private def recordBreak: Parser[String] = regex("""(?:\r\n|\r|\n)(?!\s*\z)""".r)

  // Whole input: blank space before the first and after the last record is discarded.
  def file: Parser[List[List[String]]] = WS_OPTION ~> repsep(record, recordBreak) <~ WS_OPTION

  // One line: at least one field (a field may be empty).
  def record: Parser[List[String]] = rep1sep(field, COMMA)

  def field: Parser[String] = (escaped | nonescaped)

  // Quoted field: content is kept verbatim (apart from the implicit tab skipping above).
  def escaped: Parser[String] =
    (inlineSpace ~> (DQUOTE ~> rep(TXT | COMMA | CR | LF | DQUOTE2) <~ DQUOTE) <~ inlineSpace) ^^ {
      case pieces => pieces.mkString("")
    }

  // Unquoted field.
  // 2015- Only trim space content around non escaped terms. Example: Spacey , -> "Spacey"
  def nonescaped: Parser[String] =
    (inlineSpace ~> rep(regex(TXT)) <~ inlineSpace) ^^ {
      case pieces => pieces.mkString("").trim
    }

  /**
   * Parses `s` into records of fields.
   *
   * @param s the complete text to parse
   * @return one inner list per record; an empty list when the input does not match the grammar
   *         (the failure details are intentionally not surfaced, as before)
   */
  def parse(s: String): List[List[String]] = parseAll(file, s) match {
    case Success(res, _) => res
    case _ => List[List[String]]()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment