Skip to content

Instantly share code, notes, and snippets.

@piotrbelina
Created August 3, 2013 10:12
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save piotrbelina/6145972 to your computer and use it in GitHub Desktop.
Save piotrbelina/6145972 to your computer and use it in GitHub Desktop.
Scalding apache log parser for boomerang.js
import cascading.tuple.{Fields, TupleEntry}
import com.twitter.scalding._
import java.net.URLDecoder
import scala.util.matching.Regex
/**
 * Scalding job that reads web-server access-log lines, keeps the requests for
 * `/beacon.php` (boomerang.js beacons) and aggregates the timing values carried
 * in the beacon query string per reported page URL.
 *
 * Command-line arguments read by this job:
 *  - `input`  – path of the text file(s) with the raw log lines
 *  - `output` – path the aggregated rows are written to
 *  - `trap`   – path of the TSV trap attached to the pipe; records whose
 *               processing throws (e.g. unparseable lines or beacons without
 *               the required parameters) are expected to be diverted there
 *               instead of failing the job
 */
class BoomerangLogJob(args: Args) extends Job(args) {
  import BoomerangLogJob.LogLinePattern

  val input = TextLine(args("input"))
  val output = TextLine(args("output"))
  val trap = Tsv(args("trap"))

  val inputFields = 'line
  val regexFields = ('ip, 'time, 'method, 'event)

  input
    .read
    .addTrap(trap)
    // Split every raw line into (ip, time, method, event); all other fields are dropped.
    .mapTo('line -> regexFields) {
      te: TupleEntry =>
        val line = te.getString("line")
        // Still a NoSuchElementException (as with the previous bare `.get`), but with
        // a message that tells which line could not be parsed.
        val split = LogLinePattern.findFirstMatchIn(line).getOrElse(
          throw new NoSuchElementException("unparseable access-log line: " + line)
        ).subgroups
        (split(0), split(1), split(2), split(3))
    }
    // Only beacon requests are of interest.
    .filter('event) { event: String => event.matches("^/beacon\\.php.*") }
    .map('event -> ('url, 'done, 'resp)) { event: String => urlParse(event) }
    // Per page URL: number of beacons plus the averages of 'done and 'resp.
    .groupBy('url) { _.size.average('done).average('resp) }
    // NOTE(review): this groups by 'size and orders by 'url (reversed) inside each
    // group; if a global ordering by hit count was intended, confirm and adjust.
    .groupBy('size) { _.sortBy('url).reverse.take(1000000) }
    .write(output)

  /**
   * Extracts the boomerang parameters `u`, `t_done` and `t_resp` from a beacon
   * request URL.
   *
   * Parameter names and values are URL-decoded as UTF-8. Parameters with an
   * empty value (`r2=`) or without any `=` are accepted and map to the empty
   * string, and a value that itself contains `=` is kept intact.
   *
   * @param url request URL including its query string, e.g. `/beacon.php?u=...&t_done=...`
   * @return the decoded values of `u`, `t_done` and `t_resp`, in that order
   * @throws NoSuchElementException if the URL has no query string or one of the
   *                                three required parameters is missing
   */
  def urlParse(url: String): (String, String, String) = {
    def decode(s: String): String = URLDecoder.decode(s, "UTF-8")

    // Limit 2: only the first '?' separates the path from the query string.
    val queryString = url.split("\\?", 2) match {
      case Array(_, q) => q
      case _ => throw new NoSuchElementException("no query string in beacon URL: " + url)
    }

    val params: Map[String, String] = queryString
      .split("&")
      .filter(_.nonEmpty)
      .map { param =>
        // Limit 2: split on the first '=' only, and keep an empty value when nothing follows it.
        param.split("=", 2) match {
          case Array(key, value) => decode(key) -> decode(value)
          case Array(key) => decode(key) -> ""
        }
      }
      .toMap

    def required(name: String): String =
      params.getOrElse(
        name,
        throw new NoSuchElementException("beacon parameter '" + name + "' missing in: " + url)
      )

    (required("u"), required("t_done"), required("t_resp"))
  }
}

/** Constants shared by all [[BoomerangLogJob]] instances. */
object BoomerangLogJob {
  /**
   * Pattern for one access-log line. Capture groups, in order: client IP, the
   * bracketed timestamp, the request method and the requested URL.
   *
   * Compiled once here rather than once per input line, and kept in the
   * companion object so that the per-tuple closure reads it statically.
   */
  private val LogLinePattern =
    new Regex("([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*).*$")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment