Created
November 17, 2013 23:59
-
-
Save XuefengWu/7520046 to your computer and use it in GitHub Desktop.
CSV.parse and url regular expression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package myaccount | |
import scala.util.parsing.combinator.RegexParsers | |
/**
 * Small script: loads a CSV export and, for every data row, prints the
 * contents of its "Comments" column followed by whether that text contains
 * something that looks like a URL.
 */
object CatchingUrl extends App {
  // Location of the CSV export to scan.
  val path = "/Users/twer/Downloads/Enquiries - 107718461.csv"

  // URL-ish pattern: any prefix, then at least one whitespace or '=' character,
  // an optional "http://" / "https://" scheme, one or more dot-terminated labels
  // made of digits, word characters or hyphens, a run of ASCII letters, and
  // finally any trailing path/query characters.
  // Because a leading separator is mandatory, a URL sitting at the very start
  // of the text is not matched.
  val urlPat = """(.*)([ \t\r\n\f=]+(https?://)?([\d\w-]+\.)+[A-Za-z]+[\w- ./?%&=]*)(.*)""".r

  // One Map per data row, keyed by the (whitespace-stripped) header name.
  val lines = CSV.parse(path)

  for (row <- lines) {
    // Rows without a "Comments" column are treated as having an empty comment.
    val comment = row.getOrElse("Comments", "")
    println()
    println(comment)
    println(urlPat.findFirstIn(comment).isDefined)
  }
}
/**
 * Minimal CSV reader built on `RegexParsers`.
 *
 * Grammar:
 *  - a file is a sequence of records separated by "\r\n" or "\n";
 *  - a record is a sequence of fields separated by ",";
 *  - a field is either quoted (may contain commas, line breaks and `""` as an
 *    escaped quote, and may be surrounded by spaces/tabs) or unquoted (any run
 *    of characters other than quote, comma, CR and LF).
 *
 * The first record is treated as the header. Every following record becomes a
 * `Map` from header name to field value.
 */
object CSV extends RegexParsers {
  // Whitespace is meaningful inside CSV fields, so the parser must not skip it.
  override val skipWhitespace = false

  private def COMMA: Parser[String] = ","
  private def DQUOTE: Parser[String] = "\""
  // Escaped quote inside a quoted field: two double quotes collapse into one.
  private def DQUOTE2: Parser[String] = "\"\"" ^^ (_ => "\"")
  private def CRLF: Parser[String] = "\r\n" | "\n"
  // A single character that is legal in an unquoted field.
  private def TXT: Parser[String] = "[^\",\r\n]".r
  private def SPACES: Parser[String] = "[ \t]+".r

  private def file: Parser[List[List[String]]] = repsep(record, CRLF) <~ opt(CRLF)

  private def record: Parser[List[String]] = repsep(field, COMMA)

  // Try the quoted form first; fall back to the unquoted form.
  private def field: Parser[String] = escaped | nonescaped

  // Quoted field: optional padding, opening quote, content, closing quote,
  // optional padding. Only the content is kept.
  private def escaped: Parser[String] =
    (opt(SPACES) ~> DQUOTE ~> rep(TXT | COMMA | CRLF | DQUOTE2) <~ DQUOTE <~ opt(SPACES)) ^^ (_.mkString(""))

  // Unquoted field; may be empty.
  private def nonescaped: Parser[String] = rep(TXT) ^^ (_.mkString(""))

  // Reads all lines from the source and re-joins them with "\r\n" before parsing.
  private def parseIo(i: scala.io.BufferedSource): List[Map[String, String]] =
    parseString(i.getLines().mkString("\r\n"))

  /**
   * Parses CSV text into one map per data row.
   *
   * Header names have all whitespace removed; columns whose resulting name is
   * empty are dropped. A row with fewer fields than the header yields a map
   * without the missing columns (instead of throwing), and fields beyond the
   * header's width are ignored. If the text cannot be parsed, an empty list is
   * returned.
   */
  private def parseString(s: String): List[Map[String, String]] = parseAll(file, s) match {
    case Success(header :: rows, _) =>
      val names = header.map(_.replaceAll("\\s+", ""))
      rows.map { row =>
        // zip stops at the shorter side, so short rows cannot index out of bounds.
        names.zip(row).collect { case (name, value) if name.nonEmpty => name -> value }.toMap
      }
    case _ => Nil
  }

  /**
   * Reads and parses the CSV file at `path`.
   *
   * @param path file system path of the CSV file
   * @return one map per data row, keyed by header name; empty if parsing fails
   */
  def parse(path: String): List[Map[String, String]] = {
    val source = scala.io.Source.fromFile(path)
    // Always release the file handle, even if reading or parsing throws.
    try parseIo(source) finally source.close()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment