Skip to content

Instantly share code, notes, and snippets.

@cwpenhale
Last active May 17, 2018 04:40
Show Gist options
  • Save cwpenhale/ea74cc98f0bdf706cbcbdb82ce7b844d to your computer and use it in GitHub Desktop.
Save cwpenhale/ea74cc98f0bdf706cbcbdb82ce7b844d to your computer and use it in GitHub Desktop.
Design a program that returns the most frequently occurring character in a 100 TB string. You can assume any number of machines and storage.
package com.connorpenhale.examples.scala
import org.apache.camel.CamelContext
import org.apache.camel.Exchange
import org.apache.camel.scala.dsl.builder.ScalaRouteBuilder
import org.apache.camel.processor.aggregate.GroupedExchangeAggregationStrategy
/**
 * A Camel route, written with the Scala DSL, that groups the characters of a
 * file by value and logs how many exchanges ended up in each group.
 *
 * Steps performed by the route:
 *  1. read `100TBFile.txt` from `./bigFiles/`;
 *  2. split the body into one exchange per character, processed in parallel;
 *  3. replace the whitespace characters ' ', '\n' and '\r' with the readable
 *     labels "space", "newline" and "return";
 *  4. copy the (possibly relabelled) body into the `character` header;
 *  5. aggregate the exchanges by that header and log the size of each group.
 *
 * The route only logs per-character group sizes; it does not itself compare
 * the groups to pick the largest one.
 *
 * @param context the Camel context this route builder is registered with
 */
class MyRouteBuilder(override val context: CamelContext) extends ScalaRouteBuilder(context) {
  "file://./bigFiles/?fileName=100TBFile.txt" ==> {
    split(_.in[String].toArray).parallelProcessing {
      process { exchange =>
        val message = exchange.getIn
        // Read the body once and compare against literals. This runs for every
        // character of the input, so avoid compiling a regex per comparison.
        message.getBody(classOf[String]) match {
          case " "  => message.setBody("space")
          case "\n" => message.setBody("newline")
          case "\r" => message.setBody("return")
          case _    => () // any other character (or an empty body) is left untouched
        }
      }
      // The header doubles as the correlation key for the aggregator below.
      setHeader("character", e => e.getIn.getBody(classOf[String]))
      aggregate(e => e.getIn.getHeader("character", classOf[String]), new GroupedExchangeAggregationStrategy()).completionTimeout(60000) {
        process { grouped =>
          // Every exchange in a group carries the same character, so the first
          // one is representative.
          // NOTE(review): this reads the grouped body as a Scala List; confirm a
          // type converter from the strategy's list type is available at runtime.
          val character = grouped.getIn.getBody(classOf[List[Exchange]])(0).getIn.getBody(classOf[String])
          grouped.getIn.setHeader("character", character)
        }
        // Deliberately a plain string: the ${...} placeholders are passed through
        // to Camel's log step rather than being interpolated by Scala.
        log("Aggregate ${in.header.character} ${in.body.size}")
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment