Skip to content

Instantly share code, notes, and snippets.

@ASRagab
Created November 15, 2015 05:35
Show Gist options
  • Save ASRagab/8e52e495896ec2974959 to your computer and use it in GitHub Desktop.
Save ASRagab/8e52e495896ec2974959 to your computer and use it in GitHub Desktop.
Apache Flink Local: Berkeley Data and Simpson's Paradox
import org.apache.flink.api.scala._
/**
 * Local Apache Flink batch job over the Berkeley admissions CSV.
 *
 * For every (department, gender) pair the job sums the applicant counts, then
 * joins the totals back onto the raw rows to print, per row, the percentage of
 * that pair's applicants that have the row's admission status.
 */
object BerkeleyData {

  /** Input file used when no path is given on the command line. */
  private val DefaultPath = "/path/to/Berkeley.csv"

  /**
   * One CSV row. Field order follows the file's columns: Admit, Gender, Dept, Freq.
   *
   * Declared at object scope rather than locally inside `main`.
   * NOTE(review): presumably friendlier to Flink's type analysis/serialization
   * than a method-local class — confirm against the Flink version in use.
   */
  final case class Admission(status: String, gender: String, dept: String, number: Double = 0.0)

  /**
   * Runs the job and prints one line per CSV row in the form
   * `dept, gender, status, percentage%`, sorted by department, status, gender.
   *
   * @param args optional; `args(0)` overrides the default input path
   */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val path = args.headOption.getOrElse(DefaultPath)

    val data = env.readCsvFile[Admission](
      filePath = path,
      ignoreFirstLine = true
    )

    // Total number of applicants per (dept, gender).
    val totals = data
      .map { r => (r.dept, r.gender, r.number) }
      .groupBy(0, 1)
      .reduce { (a, b) => (a._1, a._2, a._3 + b._3) }

    val shares = totals
      .join(data)
      // Left key: tuple fields (dept, gender). Right key: Admission positions 2 (dept) and 1 (gender).
      .where(0, 1)
      .equalTo(2, 1) { (total, row) =>
        // A zero total yields NaN/Infinity here; makeDoublePretty passes those through.
        (total._1, total._2, row.status, 100 * row.number / total._3)
      }
      .collect()
      .sortBy(s => (s._1, s._3, s._2))

    shares.foreach { case (dept, gender, status, pct) =>
      println(s"$dept, $gender, $status, ${makeDoublePretty(pct)}%")
    }
  }

  /**
   * Rounds `number` to two decimal places using HALF_UP rounding.
   *
   * NaN and infinite inputs are returned unchanged, because `BigDecimal` cannot
   * represent them and would throw a `NumberFormatException`.
   *
   * @param number value to round
   * @return the rounded value, or `number` itself when it is not finite
   */
  def makeDoublePretty(number: Double): Double =
    if (number.isNaN || number.isInfinity) number
    else BigDecimal(number).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble
}
//Berkeley.csv
//Admit,Gender,Dept,Freq
//Admitted,Male,A,512
//Rejected,Male,A,313
//Admitted,Female,A,89
//Rejected,Female,A,19
//Admitted,Male,B,353
//Rejected,Male,B,207
//Admitted,Female,B,17
//Rejected,Female,B,8
//Admitted,Male,C,120
//Rejected,Male,C,205
//Admitted,Female,C,202
//Rejected,Female,C,391
//Admitted,Male,D,138
//Rejected,Male,D,279
//Admitted,Female,D,131
//Rejected,Female,D,244
//Admitted,Male,E,53
//Rejected,Male,E,138
//Admitted,Female,E,94
//Rejected,Female,E,299
//Admitted,Male,F,22
//Rejected,Male,F,351
//Admitted,Female,F,24
//Rejected,Female,F,317
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment