Skip to content

Instantly share code, notes, and snippets.

@XuefengWu
Created November 17, 2013 23:59
Show Gist options
  • Save XuefengWu/7520046 to your computer and use it in GitHub Desktop.
Save XuefengWu/7520046 to your computer and use it in GitHub Desktop.
CSV.parse and url regular expression
package myaccount
import scala.util.parsing.combinator.RegexParsers
/**
 * Ad-hoc script: loads a CSV export and, for every row, prints the value of
 * its "Comments" column followed by whether that text appears to contain a URL.
 *
 * Output per row is three lines: an empty line, the comment text, and
 * `true`/`false` for the URL check.
 */
object CatchingUrl extends App {
  // Hard-coded location of the CSV export to scan.
  val path = "/Users/twer/Downloads/Enquiries - 107718461.csv"

  // URL heuristic: one or more separator characters (space, tab, CR, LF, FF or '='),
  // an optional http:// or https:// scheme, one or more dot-terminated labels,
  // an alphabetic final label, then any trailing path/query characters.
  // NOTE(review): because a leading separator is mandatory, a comment that
  // consists solely of a URL (no whitespace or '=' before it) is reported as
  // `false` - confirm that this is intended.
  val urlPat = """(.*)([ \t\r\n\f=]+(https?://)?([\d\w-]+\.)+[A-Za-z]+[\w- ./?%&=]*)(.*)""".r

  // One Map per data row, keyed by the header names with whitespace removed.
  val lines = CSV.parse(path)

  // foreach, not map: the body is executed purely for its printing side effect.
  lines.foreach { l =>
    // Rows without a "Comments" column are treated as having an empty comment.
    val c = l.getOrElse("Comments", "")
    println()
    println(c)
    println(urlPat.findFirstIn(c).isDefined)
  }
}
/**
 * Minimal CSV reader built on parser combinators.
 *
 * The first record of the input is treated as the header. Every following
 * record becomes a `Map` from header name (with all whitespace removed) to the
 * field value found in the same column. Columns whose header is empty after
 * whitespace removal are ignored.
 *
 * Fields may be wrapped in double quotes; inside a quoted field commas and
 * line breaks are literal and a doubled quote (`""`) stands for one quote.
 */
object CSV extends RegexParsers {
  override val skipWhitespace = false // meaningful spaces in CSV

  private def COMMA: Parser[String] = ","
  private def DQUOTE: Parser[String] = "\""
  // combine 2 dquotes into 1
  private def DQUOTE2: Parser[String] = "\"\"" ^^^ "\""
  private def CRLF: Parser[String] = "\r\n" | "\n"
  // A single character that is not a quote, comma or line break.
  private def TXT: Parser[String] = "[^\",\r\n]".r
  private def SPACES: Parser[String] = "[ \t]+".r

  // Records separated by line breaks, with an optional trailing line break.
  // `opt`/`rep` are used instead of the postfix `?`/`*` operators so that no
  // `scala.language.postfixOps` import is required.
  private def file: Parser[List[List[String]]] = repsep(record, CRLF) <~ opt(CRLF)

  private def record: Parser[List[String]] = repsep(field, COMMA)

  // Try the quoted form first; fall back to raw text (which may be empty).
  private def field: Parser[String] = escaped | nonescaped

  // Quoted field: spaces/tabs outside the quotes are dropped, the content
  // between the quotes is kept verbatim apart from `""` -> `"`.
  private def escaped: Parser[String] =
    (opt(SPACES) ~> DQUOTE ~> rep(TXT | COMMA | CRLF | DQUOTE2) <~ DQUOTE <~ opt(SPACES)) ^^ (_.mkString)

  // Unquoted field: everything up to the next comma, quote or line break.
  private def nonescaped: Parser[String] = rep(TXT) ^^ (_.mkString)

  // Re-joins the source's lines with CRLF so the grammar sees one uniform line terminator.
  private def parseIo(i: scala.io.BufferedSource): List[Map[String, String]] =
    parseString(i.getLines().mkString("\r\n"))

  /**
   * Parses CSV text into one map per data record.
   *
   * A record with fewer fields than the header simply lacks the missing
   * columns in its map (previously this threw `IndexOutOfBoundsException`);
   * surplus fields beyond the header are ignored. If the text does not match
   * the grammar, an empty list is returned.
   */
  private def parseString(s: String): List[Map[String, String]] = parseAll(file, s) match {
    case Success(header :: rows, _) =>
      val columnNames = header.map(_.replaceAll("\\s+", ""))
      rows.map { row =>
        // zip pairs columns positionally and stops at the shorter side, so a
        // short row cannot index out of bounds; on duplicate names the last wins.
        columnNames.zip(row).collect {
          case (name, value) if name.nonEmpty => name -> value
        }.toMap
      }
    case _ => Nil
  }

  /**
   * Reads the CSV file at `path` and returns one map per data record.
   *
   * @param path file system path of the CSV file
   * @return the data records keyed by header name; empty if the content could not be parsed
   */
  def parse(path: String): List[Map[String, String]] = {
    val source = scala.io.Source.fromFile(path)
    // Always release the file handle, even when reading or parsing throws.
    try parseIo(source) finally source.close()
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment