bishabosha/Scala3ProfileParser.scala Secret

## aggregate.sc
//> using toolkit latest
import profileparser.*

// Every entry of `<working dir>/input` is later handed to `parseFile`.
val inputDir = os.pwd / "input"
val sources = os.list(inputDir)

// Banner printed ahead of the per-run phase timings.
println("data:")

/** Extracts the project name from a profile file named `compile-<project>-profile...`.
  *
  * Any file name that does not follow this shape yields `"<unknown>"`.
  */
def project(source: os.Path) =
  val fileName = source.last
  fileName match
    case s"compile-$name-profile$suffix" => name
    case _ => "<unknown>"

/** Tells which worker a profile file belongs to, based on its file name.
  *
  * `compile-<project>-profile.csv` is the `"main"` profile,
  * `compile-<project>-profile-worker-<id>.csv` is `"worker-<id>"`,
  * and any other name yields `"<unknown>"`.
  */
def worker(source: os.Path) =
  val fileName = source.last
  fileName match
    // order matters: the plain profile name is tried before the worker form
    case s"compile-$proj-profile.csv" => "main"
    case s"compile-$proj-profile-worker-$n.csv" => s"worker-$n"
    case _ => "<unknown>"

/** Shortens a phase name for display.
  *
  * A name of the form `MegaPhase{a  b  c}` (members separated by two spaces) is
  * abbreviated to `[a..c]`, i.e. its first and last member. Every other name is
  * returned unchanged.
  *
  * @param phase the phase name as found in the profile
  * @return the abbreviated name, or `phase` itself when there is nothing to abbreviate
  */
def simplePhase(phase: String) = phase match
  case s"MegaPhase{$phases}" =>
    val names = phases.split("  ")
    // `split` drops trailing empty strings, so a body made only of separators
    // produces an empty array; fall back to the raw name instead of throwing.
    (names.headOption, names.lastOption) match
      case (Some(first), Some(last)) => s"[$first..$last]"
      case _ => phase
  case other => other


/** One line of the Gantt chart: a task, the timestamp at which it started and the
  * ordered `(phase name, duration)` pairs recorded for it.
  */
case class GanttRow(
  taskId: String,
  start: Long,
  phaseDurations: List[(String, Long)]
)

// Collects one GanttRow per profile run found in the input files.
val tasks = List.newBuilder[GanttRow]

for
  source <- sources
  ProfileRun(id, _, events) <- parseFile(source.toString)
do
  val proj = project(source)
  val wrk = worker(source)
  println("=========================================")
  println(s"profile run $id for $proj ($wrk)")
  // Only phase rows contribute to the chart; other events (GC rows) are ignored here.
  val phaseRows = events.collect { case row: Event.PhaseRow => row }
  val phaseDurations = phaseRows.map { row =>
    val phaseName = simplePhase(row.phaseName)
    val duration = row.endNs - row.startNs
    println(f"$phaseName: ${duration.toDouble / 1_000_000}%.3f ms")
    phaseName -> duration
  }
  phaseRows.headOption match
    case Some(first) =>
      // the run starts when its first phase starts
      tasks += GanttRow(s"$proj ($wrk)", first.startNs, phaseDurations)
    case None =>
      // A run without phase rows has no start time. Recording it with a sentinel
      // start of -1 would make -1 the global minimum and skew every row's offset.
      Console.err.println(s"skipping profile run $id for $proj ($wrk): no phase events")

val rowsRaw = tasks.result()

// `min`/`max` throw on an empty collection. Falling back to 0 lets an empty input
// directory produce a header-only CSV instead of crashing the script.
val initialStart = rowsRaw.map(_.start).minOption.getOrElse(0L)
val maxPhaseCount = rowsRaw.map(_.phaseDurations.length).maxOption.getOrElse(0)

// phase name -> ids of the tasks in which that phase was recorded
val allPhases = rowsRaw.flatMap(r => r.phaseDurations.map(_(0) -> r.taskId))
val phasesToIds = allPhases.groupMap((phase, task) => phase)((phase, task) => task)

println("=========================================")
println("ANALYSIS")
println("=========================================")

// val taskIds = rowsRaw.map(r => r.taskId).toSet

// val taskPhases = rowsRaw.map(r => r.taskId -> r.phaseDurations).toMap
// var taskToPatchedPhases = taskPhases

// phasesToIds.foreach { case (phase, tasks) =>
//   if tasks.length != taskIds.size then
//     println(s"$phase appears only in: ${tasks.mkString(", ")}")
//     val unseen = taskIds -- tasks
//     unseen.foreach(task =>
//       val taskWithPhase = tasks.head
//       val phases0 = taskToPatchedPhases(taskWithPhase)
//       val (_ :+ (prePhase, _), _) = phases0.span(_(0) != phase): @unchecked
//       val (pre0, post0) = taskToPatchedPhases(task).span(_(0) != prePhase): @unchecked
//       val patched =
//         if post0.isEmpty then
//           ???
//         else
//           val (pre1 :: post) = post0: @unchecked
//           (pre0 ::: pre1 :: (phase, 0L) :: post)
//       println(s"patching $task to add $phase, now ${patched.map(_(0)).mkString(", ")}")
//       taskToPatchedPhases += task -> patched
//     )
// }

// val patchedSizes = taskToPatchedPhases.map((task, phases) => phases.length -> task).groupMap(_._1)(_._2)
// assert(patchedSizes.size == 1, s"not all tasks have the same number of phases: ${patchedSizes.map((s, ps) => s"[$s](${ps.mkString(",")})").mkString(", ")}")

// val maxPhaseCount0 = taskToPatchedPhases.map(_(1).length).max
// assert(maxPhaseCount0 == maxPhaseCount, s"extra phases were added, see ${taskToPatchedPhases.filter(_._2.length != maxPhaseCount).map(_(0)).mkString("\n")}")

// Turn each GanttRow into CSV cells: the task id, the start offset relative to
// initialStart, then one duration per phase, zero-padded up to maxPhaseCount cells.
val rows =
  for row <- rowsRaw yield
    val startOffset = row.start - initialStart
    // alternative source of durations: taskToPatchedPhases(row.taskId)
    val durations = row.phaseDurations.map((_, duration) => duration).padTo(maxPhaseCount, 0L)
    row.taskId :: startOffset.toString :: durations.map(_.toString)

// Column titles: two fixed columns followed by one numbered column per phase slot.
val header = "Task ID" :: "Start" :: List.tabulate(maxPhaseCount)(i => s"Phase ${i + 1} Duration")

// Cells are joined with commas and lines with newlines; the result replaces
// output/gantt.csv, creating the folder when it is missing.
val csvLines = (header :: rows).map(_.mkString(","))
val asCSV = csvLines.mkString("\n")
os.write.over(os.pwd / "output" / "gantt.csv", asCSV, createFolders = true)

## Scala3ProfileParser.scala
package profileparser

import scala.util.Using
import scala.io.Source

/** The events recorded for one profiling run, together with the run id and the
  * target that `parseFile` reads from the run's `info` line.
  */
case class ProfileRun(
  id: Int,
  target: String,
  events: List[Event]
)

/** Distinguishes rows tagged `EventType(main)` from rows tagged `EventType(background)`. */
enum ThreadState:
  case Main
  case Background

/** A single data row of a profile CSV, as produced by `parseLine`. */
enum Event:
  /** A row whose first column is `EventType(GC)`.
    *
    * `parseLine` fills the fields, in declaration order, from columns 1 to 8.
    * The `Ns`/`Ms` suffixes presumably mean nanoseconds/milliseconds — TODO confirm
    * against the profiler that writes the file.
    */
  case GCRow(
    startNs: Long,
    endNs: Long,
    startMs: Long,
    endMs: Long,
    name: String,
    action: String,
    cause: String,
    threads: Int
  )
  /** A row whose first column is `EventType(main)` or `EventType(background)`.
    *
    * `state` records which of the two tags the row carried; the remaining fields
    * are read by `parseLine`, in declaration order, from columns 1 to 15.
    */
  case PhaseRow(
    state: ThreadState,
    startNs: Long,
    endNs: Long,
    runId: Int,
    phaseId: Int,
    phaseName: String,
    purpose: String,
    taskCount: Int,
    threadId: Int,
    threadName: String,
    runNs: Long,
    idleNs: Long,
    cpuTimeNs: Long,
    userTimeNs: Long,
    allocatedByte: Long,
    heapSize: Long
  )

/** Parses one comma-separated data row of a profile CSV into an [[Event]].
  *
  * The first column selects the row kind: `EventType(GC)` gives a `GCRow`,
  * `EventType(main)` and `EventType(background)` give a `PhaseRow`.
  *
  * @param line a data row (not an `info,` or `header(` line)
  * @return the parsed event
  * @throws IllegalArgumentException if the first column is not a known event type
  * @throws NumberFormatException if a numeric column cannot be parsed
  * @throws ArrayIndexOutOfBoundsException if the row has too few columns
  */
private def parseLine(line: String): Event = {
  val fields = line.split(",")

  // `main` and `background` rows share one column layout; only the state differs.
  def phaseRow(state: ThreadState): Event =
    Event.PhaseRow(
      state = state,
      startNs = fields(1).toLong,
      endNs = fields(2).toLong,
      runId = fields(3).toInt,
      phaseId = fields(4).toInt,
      phaseName = fields(5),
      purpose = fields(6),
      taskCount = fields(7).toInt,
      threadId = fields(8).toInt,
      threadName = fields(9),
      runNs = fields(10).toLong,
      idleNs = fields(11).toLong,
      cpuTimeNs = fields(12).toLong,
      userTimeNs = fields(13).toLong,
      allocatedByte = fields(14).toLong,
      heapSize = fields(15).toLong
    )

  fields(0) match {
    case "EventType(GC)" =>
      Event.GCRow(
        startNs = fields(1).toLong,
        endNs = fields(2).toLong,
        startMs = fields(3).toLong,
        endMs = fields(4).trim.toLong, // dotty adds an extra space here
        name = fields(5),
        action = fields(6),
        cause = fields(7),
        threads = fields(8).toInt
      )
    case "EventType(main)" =>
      phaseRow(ThreadState.Main)
    case "EventType(background)" =>
      phaseRow(ThreadState.Background)
    case unknown =>
      // Without this case an unexpected row surfaced as a bare MatchError that
      // gave no hint about which line of the file was at fault.
      throw new IllegalArgumentException(s"unknown event type '$unknown' in profile line: $line")
  }
}

/** Reads a profile CSV and groups its events into runs.
  *
  * Expected layout, repeated once per run: an `info,` line carrying the run id,
  * the CSV version (must be 2) and the target; `header(` lines; then data rows,
  * each of which is handed to `parseLine`. Blank lines are ignored and runs
  * without any data row are dropped.
  *
  * @param filename path of the profile CSV to read
  * @return the runs in file order, each labelled with the id and target of the
  *         `info` line that preceded its data rows
  */
def parseFile(filename: String): List[ProfileRun] = {
  Using(Source.fromFile(filename)): source =>
    val runs = List.newBuilder[ProfileRun]
    val events = collection.mutable.ListBuffer.empty[Event]

    var inHeader = false
    var runId = -1
    var target = ""

    // Emits the rows gathered so far as one run, using the id/target currently held.
    def flushRun(): Unit =
      if events.nonEmpty then
        runs += ProfileRun(runId, target, events.toList)
        events.clear()

    for line <- source.getLines() do
      if line.startsWith("info,") then
        assert(!inHeader)
        // Flush BEFORE reading the new id/target: flushing later (at the first data
        // row of the new run) labelled the previous run with the new run's metadata
        // and left the last run of the file with id -1 and an empty target.
        flushRun()
        inHeader = true
        val header = line.split(",").map(_.trim())
        runId = header(1).toInt
        val _ = header(2).ensuring(_ == "version", "unexpected format of csv file")
        val _ = header(3).toInt.ensuring(_ == 2, "unexpected format of csv file")
        target = header(5)
      else if line.startsWith("header(") then
        assert(inHeader)
      else if line.trim().nonEmpty then
        // first data row ends the header section of the current run
        inHeader = false
        events += parseLine(line.trim())

    flushRun()
    runs.result()
}.get
	//> using toolkit latest
	import profileparser.*

	// Every entry of `<working dir>/input` is later handed to `parseFile`.
	val inputDir = os.pwd / "input"
	val sources = os.list(inputDir)

	// Banner printed ahead of the per-run phase timings.
	println("data:")

	/** Extracts the project name from a profile file named `compile-<project>-profile...`.
	  *
	  * Any file name that does not follow this shape yields `"<unknown>"`.
	  */
	def project(source: os.Path) =
		val fileName = source.last
		fileName match
			case s"compile-$name-profile$suffix" => name
			case _ => "<unknown>"

	/** Tells which worker a profile file belongs to, based on its file name.
	  *
	  * `compile-<project>-profile.csv` is the `"main"` profile,
	  * `compile-<project>-profile-worker-<id>.csv` is `"worker-<id>"`,
	  * and any other name yields `"<unknown>"`.
	  */
	def worker(source: os.Path) =
		val fileName = source.last
		fileName match
			// order matters: the plain profile name is tried before the worker form
			case s"compile-$proj-profile.csv" => "main"
			case s"compile-$proj-profile-worker-$n.csv" => s"worker-$n"
			case _ => "<unknown>"

	/** Shortens a phase name for display.
	  *
	  * A name of the form `MegaPhase{...}` is abbreviated to `[first..last]`, built
	  * from the first and last space-separated member. Every other name is returned
	  * unchanged.
	  *
	  * @param phase the phase name as found in the profile
	  * @return the abbreviated name, or `phase` itself when there is nothing to abbreviate
	  */
	def simplePhase(phase: String) = phase match
		case s"MegaPhase{$phases}" =>
			val names = phases.split(" ")
			// `split` drops trailing empty strings, so a body made only of separators
			// produces an empty array; fall back to the raw name instead of throwing.
			(names.headOption, names.lastOption) match
				case (Some(first), Some(last)) => s"[$first..$last]"
				case _ => phase
		case other => other


	/** One line of the Gantt chart: a task, the timestamp at which it started and the
	  * ordered `(phase name, duration)` pairs recorded for it.
	  */
	case class GanttRow(
		taskId: String,
		start: Long,
		phaseDurations: List[(String, Long)]
	)

	// Collects one GanttRow per profile run found in the input files.
	val tasks = List.newBuilder[GanttRow]

	for
		source <- sources
		ProfileRun(id, _, events) <- parseFile(source.toString)
	do
		val proj = project(source)
		val wrk = worker(source)
		println("=========================================")
		println(s"profile run $id for $proj ($wrk)")
		// Only phase rows contribute to the chart; other events (GC rows) are ignored here.
		val phaseRows = events.collect { case row: Event.PhaseRow => row }
		val phaseDurations = phaseRows.map { row =>
			val phaseName = simplePhase(row.phaseName)
			val duration = row.endNs - row.startNs
			println(f"$phaseName: ${duration.toDouble / 1_000_000}%.3f ms")
			phaseName -> duration
		}
		phaseRows.headOption match
			case Some(first) =>
				// the run starts when its first phase starts
				tasks += GanttRow(s"$proj ($wrk)", first.startNs, phaseDurations)
			case None =>
				// A run without phase rows has no start time. Recording it with a sentinel
				// start of -1 would make -1 the global minimum and skew every row's offset.
				Console.err.println(s"skipping profile run $id for $proj ($wrk): no phase events")

	val rowsRaw = tasks.result()

	// `min`/`max` throw on an empty collection. Falling back to 0 lets an empty input
	// directory produce a header-only CSV instead of crashing the script.
	val initialStart = rowsRaw.map(_.start).minOption.getOrElse(0L)
	val maxPhaseCount = rowsRaw.map(_.phaseDurations.length).maxOption.getOrElse(0)

	// phase name -> ids of the tasks in which that phase was recorded
	val allPhases = rowsRaw.flatMap(r => r.phaseDurations.map(_(0) -> r.taskId))
	val phasesToIds = allPhases.groupMap((phase, task) => phase)((phase, task) => task)

	println("=========================================")
	println("ANALYSIS")
	println("=========================================")

	// val taskIds = rowsRaw.map(r => r.taskId).toSet

	// val taskPhases = rowsRaw.map(r => r.taskId -> r.phaseDurations).toMap
	// var taskToPatchedPhases = taskPhases

	// phasesToIds.foreach { case (phase, tasks) =>
	// if tasks.length != taskIds.size then
	// println(s"$phase appears only in: ${tasks.mkString(", ")}")
	// val unseen = taskIds -- tasks
	// unseen.foreach(task =>
	// val taskWithPhase = tasks.head
	// val phases0 = taskToPatchedPhases(taskWithPhase)
	// val (_ :+ (prePhase, _), _) = phases0.span(_(0) != phase): @unchecked
	// val (pre0, post0) = taskToPatchedPhases(task).span(_(0) != prePhase): @unchecked
	// val patched =
	// if post0.isEmpty then
	// ???
	// else
	// val (pre1 :: post) = post0: @unchecked
	// (pre0 ::: pre1 :: (phase, 0L) :: post)
	// println(s"patching $task to add $phase, now ${patched.map(_(0)).mkString(", ")}")
	// taskToPatchedPhases += task -> patched
	// )
	// }

	// val patchedSizes = taskToPatchedPhases.map((task, phases) => phases.length -> task).groupMap(_._1)(_._2)
	// assert(patchedSizes.size == 1, s"not all tasks have the same number of phases: ${patchedSizes.map((s, ps) => s"[$s](${ps.mkString(",")})").mkString(", ")}")

	// val maxPhaseCount0 = taskToPatchedPhases.map(_(1).length).max
	// assert(maxPhaseCount0 == maxPhaseCount, s"extra phases were added, see ${taskToPatchedPhases.filter(_._2.length != maxPhaseCount).map(_(0)).mkString("\n")}")

	// Turn each GanttRow into CSV cells: the task id, the start offset relative to
	// initialStart, then one duration per phase, zero-padded up to maxPhaseCount cells.
	val rows =
		for row <- rowsRaw yield
			val startOffset = row.start - initialStart
			// alternative source of durations: taskToPatchedPhases(row.taskId)
			val durations = row.phaseDurations.map((_, duration) => duration).padTo(maxPhaseCount, 0L)
			row.taskId :: startOffset.toString :: durations.map(_.toString)

	// Column titles: two fixed columns followed by one numbered column per phase slot.
	val header = "Task ID" :: "Start" :: List.tabulate(maxPhaseCount)(i => s"Phase ${i + 1} Duration")

	// Cells are joined with commas and lines with newlines; the result replaces
	// output/gantt.csv, creating the folder when it is missing.
	val csvLines = (header :: rows).map(_.mkString(","))
	val asCSV = csvLines.mkString("\n")
	os.write.over(os.pwd / "output" / "gantt.csv", asCSV, createFolders = true)
	package profileparser

	import scala.util.Using
	import scala.io.Source

	/** The events recorded for one profiling run, together with the run id and the
	  * target that `parseFile` reads from the run's `info` line.
	  */
	case class ProfileRun(
		id: Int,
		target: String,
		events: List[Event]
	)

	/** Distinguishes rows tagged `EventType(main)` from rows tagged `EventType(background)`. */
	enum ThreadState:
		// the cases must be indented under `enum ...:` to belong to its body
		case Main
		case Background

	/** A single data row of a profile CSV, as produced by `parseLine`.
	  *
	  * The cases are indented under `enum Event:` so that they belong to its body.
	  */
	enum Event:
		/** A row whose first column is `EventType(GC)`.
		  *
		  * `parseLine` fills the fields, in declaration order, from columns 1 to 8.
		  * The `Ns`/`Ms` suffixes presumably mean nanoseconds/milliseconds — TODO confirm
		  * against the profiler that writes the file.
		  */
		case GCRow(
			startNs: Long,
			endNs: Long,
			startMs: Long,
			endMs: Long,
			name: String,
			action: String,
			cause: String,
			threads: Int
		)
		/** A row whose first column is `EventType(main)` or `EventType(background)`.
		  *
		  * `state` records which of the two tags the row carried; the remaining fields
		  * are read by `parseLine`, in declaration order, from columns 1 to 15.
		  */
		case PhaseRow(
			state: ThreadState,
			startNs: Long,
			endNs: Long,
			runId: Int,
			phaseId: Int,
			phaseName: String,
			purpose: String,
			taskCount: Int,
			threadId: Int,
			threadName: String,
			runNs: Long,
			idleNs: Long,
			cpuTimeNs: Long,
			userTimeNs: Long,
			allocatedByte: Long,
			heapSize: Long
		)

	/** Parses one comma-separated data row of a profile CSV into an [[Event]].
	  *
	  * The first column selects the row kind: `EventType(GC)` gives a `GCRow`,
	  * `EventType(main)` and `EventType(background)` give a `PhaseRow`.
	  *
	  * @param line a data row (not an `info,` or `header(` line)
	  * @return the parsed event
	  * @throws IllegalArgumentException if the first column is not a known event type
	  * @throws NumberFormatException if a numeric column cannot be parsed
	  * @throws ArrayIndexOutOfBoundsException if the row has too few columns
	  */
	private def parseLine(line: String): Event = {
		val fields = line.split(",")

		// `main` and `background` rows share one column layout; only the state differs.
		def phaseRow(state: ThreadState): Event =
			Event.PhaseRow(
				state = state,
				startNs = fields(1).toLong,
				endNs = fields(2).toLong,
				runId = fields(3).toInt,
				phaseId = fields(4).toInt,
				phaseName = fields(5),
				purpose = fields(6),
				taskCount = fields(7).toInt,
				threadId = fields(8).toInt,
				threadName = fields(9),
				runNs = fields(10).toLong,
				idleNs = fields(11).toLong,
				cpuTimeNs = fields(12).toLong,
				userTimeNs = fields(13).toLong,
				allocatedByte = fields(14).toLong,
				heapSize = fields(15).toLong
			)

		fields(0) match {
			case "EventType(GC)" =>
				Event.GCRow(
					startNs = fields(1).toLong,
					endNs = fields(2).toLong,
					startMs = fields(3).toLong,
					endMs = fields(4).trim.toLong, // dotty adds an extra space here
					name = fields(5),
					action = fields(6),
					cause = fields(7),
					threads = fields(8).toInt
				)
			case "EventType(main)" =>
				phaseRow(ThreadState.Main)
			case "EventType(background)" =>
				phaseRow(ThreadState.Background)
			case unknown =>
				// Without this case an unexpected row surfaced as a bare MatchError that
				// gave no hint about which line of the file was at fault.
				throw new IllegalArgumentException(s"unknown event type '$unknown' in profile line: $line")
		}
	}

	/** Reads a profile CSV and groups its events into runs.
	  *
	  * Expected layout, repeated once per run: an `info,` line carrying the run id,
	  * the CSV version (must be 2) and the target; `header(` lines; then data rows,
	  * each of which is handed to `parseLine`. Blank lines are ignored and runs
	  * without any data row are dropped.
	  *
	  * @param filename path of the profile CSV to read
	  * @return the runs in file order, each labelled with the id and target of the
	  *         `info` line that preceded its data rows
	  */
	def parseFile(filename: String): List[ProfileRun] = {
		Using(Source.fromFile(filename)): source =>
			val runs = List.newBuilder[ProfileRun]
			val events = collection.mutable.ListBuffer.empty[Event]

			var inHeader = false
			var runId = -1
			var target = ""

			// Emits the rows gathered so far as one run, using the id/target currently held.
			def flushRun(): Unit =
				if events.nonEmpty then
					runs += ProfileRun(runId, target, events.toList)
					events.clear()

			for line <- source.getLines() do
				if line.startsWith("info,") then
					assert(!inHeader)
					// Flush BEFORE reading the new id/target: flushing later (at the first data
					// row of the new run) labelled the previous run with the new run's metadata
					// and left the last run of the file with id -1 and an empty target.
					flushRun()
					inHeader = true
					val header = line.split(",").map(_.trim())
					runId = header(1).toInt
					val _ = header(2).ensuring(_ == "version", "unexpected format of csv file")
					val _ = header(3).toInt.ensuring(_ == 2, "unexpected format of csv file")
					target = header(5)
				else if line.startsWith("header(") then
					assert(inHeader)
				else if line.trim().nonEmpty then
					// first data row ends the header section of the current run
					inHeader = false
					events += parseLine(line.trim())

			flushRun()
			runs.result()
	}.get