@douglaz (last active August 29, 2015 14:28)
Spark Log filtering and parsing example
/*
Given input lines such as:
15/08/23 23:29:02 INFO cluster.SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@ip-172-24-20-154.ec2.internal:58026/user/Executor#1085281150] with ID 11
15/08/23 23:29:02 INFO scheduler.TaskSetManager: Starting task 4.0 in stage 0.0 (TID 4, ip-172-24-20-154.ec2.internal, PROCESS_LOCAL, 1314 bytes)
15/08/23 23:29:02 INFO scheduler.TaskSetManager: Starting task 5.0 in stage 0.0 (TID 5, ip-172-24-20-154.ec2.internal, PROCESS_LOCAL, 1314 bytes)
the code below returns 1314 + 1314 = 2628 (only the two "Starting task" lines match).
Note: this is a simplified example. Lines that pass the filter but do not match
the regex are silently dropped, and reduce throws an exception when no lines
match at all, since an empty RDD cannot be reduced.
*/
// `lines` is assumed to be an RDD[String] of raw log lines.
// Captures the byte count reported in a task-launch log line.
val byteMatcher = ".*, ([0-9]+) bytes".r

val sumBytes = lines
  .filter(_.contains("Starting task "))                // keep only task-launch lines
  .flatMap(line => byteMatcher.findFirstMatchIn(line)) // drop lines the regex misses
  .map(m => m.group(1).toInt)                          // extract the byte count
  .reduce(_ + _)                                       // sum across the RDD
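
/*
A more defensive variant (a sketch, not part of the original gist): fold(0)
replaces reduce, so the result is 0 instead of an exception when no lines
match. `sumBytesSafe` is a hypothetical name introduced here for illustration.
*/
val sumBytesSafe = lines
  .filter(_.contains("Starting task "))
  .flatMap(line => byteMatcher.findFirstMatchIn(line))
  .map(m => m.group(1).toInt)
  .fold(0)(_ + _) // safe on an empty RDD: returns the zero value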