Skip to content

Instantly share code, notes, and snippets.

@tzachz
Created March 5, 2016 10:18
Show Gist options
  • Save tzachz/2e484273eaa85291c7a9 to your computer and use it in GitHub Desktop.
Save tzachz/2e484273eaa85291c7a9 to your computer and use it in GitHub Desktop.
Spark REST API: calculate time per job name
import java.text.SimpleDateFormat
import java.util.Date
import org.json4s._
import org.json4s.jackson.JsonMethods.parse
import scala.io.Source.fromURL
/**
 * Small command-line utility that reads the job list of a running Spark application from the
 * Spark monitoring REST API and prints the total and average duration per job name.
 */
object SparkAppStats {

  /**
   * URL of the jobs endpoint to query. The `<host>` and `<app-name>` placeholders must be
   * replaced with real values before running. The path already ends in `/jobs`.
   */
  val url: String = "http://<host>:4040/api/v1/applications/<app-name>/jobs"

  /**
   * (Partial) representation of a Spark job object as extracted from the jobs endpoint.
   *
   * @param jobId          identifier of the job
   * @param name           job name; used by [[main]] as the grouping key
   * @param submissionTime time at which the job was submitted
   * @param completionTime time at which the job completed, if a completion time is present
   * @param stageIds       identifiers of the stages belonging to the job
   */
  case class SparkJob(jobId: Int, name: String, submissionTime: Date, completionTime: Option[Date], stageIds: List[Int]) {

    /** Milliseconds between submission and completion, or `None` when there is no completion time. */
    def getDurationMillis: Option[Long] = completionTime.map(_.getTime - submissionTime.getTime)
  }

  /** json4s formats whose date formatter uses the pattern `yyyy-MM-dd'T'HH:mm:ss.SSSZ`. */
  implicit val formats: DefaultFormats = new DefaultFormats {
    override def dateFormatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ")
  }

  /**
   * Prints total millis and avg millis per job name, e.g.
   * TIME: 182570 AVG: 16597 NAME: count at MyAggregator.scala:132
   * TIME: 230973 AVG: 1297 NAME: parquet at MyRepository.scala:99
   * TIME: 120393 AVG: 2188 NAME: collect at MyCollector.scala:30
   *
   * Jobs without a completion time are ignored.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // `url` already points at the jobs endpoint; appending another "/jobs" segment would
    // request ".../jobs/jobs", which is not the jobs listing.
    val source = fromURL(url)
    // Always release the underlying connection, even if reading fails.
    val json = try source.mkString finally source.close()

    // Keep only jobs that have a duration, pairing each job name with its duration so that
    // no Option ever has to be unwrapped with `.get`.
    val timedJobs: List[(String, Long)] = parse(json)
      .extract[List[SparkJob]]
      .flatMap(job => job.getDurationMillis.map(millis => job.name -> millis))

    timedJobs
      .groupBy { case (name, _) => name }
      .foreach { case (name, entries) =>
        val time = entries.map { case (_, millis) => millis }.sum
        // groupBy never yields an empty group, so the division below is safe.
        val count = entries.size
        println(s"TIME: $time\tAVG: ${time / count}\tNAME: $name")
      }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment