@najikadri
Created January 27, 2024 22:45
Basic example of an Apache Spark application using the RDD API.
package le.pioneer

import org.apache.spark.{SparkConf, SparkContext}

// Hard-coded Spark application using the RDD API instead of DataFrames.
object HelloRDD extends Serializable {

  // One record per line of the input CSV: id, name, department, start date.
  private case class Employee(employeeID: Int, employeeName: String, department: String, startDate: String)

  def main(args: Array[String]): Unit = {
    val data_path = "data\\employees_rdd.csv"

    val sparkAppConf = new SparkConf().setAppName("Hello RDD").setMaster("local[3]")
    val sparkContext = new SparkContext(sparkAppConf)

    // Read the file as an RDD of raw lines and spread it over two partitions.
    val linesRDD = sparkContext.textFile(data_path)
    val partitionRDD = linesRDD.repartition(2)

    // Split each line into trimmed columns, then build typed Employee records.
    val colsRDD = partitionRDD.map(line => line.split(",").map(_.trim))
    val employeeRDD = colsRDD.map(cols => Employee(cols(0).toInt, cols(1), cols(2), cols(3)))

    // Group by department and count the employees in each group.
    val depGroupRdd = employeeRDD.groupBy(e => e.department)
    val depCountRdd = depGroupRdd.map {
      case (d, emp) => (d, emp.size)
    }

    // collect() pulls the small result set back to the driver for printing.
    println(depCountRdd.collect().mkString("\n"))

    sparkContext.stop()
  }
}
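
A note on the counting step: groupBy shuffles every Employee record across the cluster before the group sizes are computed. A common RDD idiom is to map each record to a (key, 1) pair and use reduceByKey, which pre-aggregates on each partition before the shuffle. A minimal sketch, assuming employeeRDD from the listing above is in scope (the value name depCountByKey is illustrative):

    // Alternative per-department count using reduceByKey (sketch, not part of the original gist).
    // Each Employee becomes a (department, 1) pair; reduceByKey sums the ones, combining
    // partial counts on each partition before any data is shuffled.
    val depCountByKey = employeeRDD
      .map(e => (e.department, 1))
      .reduceByKey(_ + _)
    println(depCountByKey.collect().mkString("\n"))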
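
Since the comment contrasts the RDD API with DataFrames, here is a rough sketch of the same aggregation written against the DataFrame API; the object name, column names, and the headerless-CSV assumption are illustrative and not part of the gist:

package le.pioneer

import org.apache.spark.sql.SparkSession

// Hypothetical DataFrame-based counterpart of HelloRDD (sketch only).
object HelloDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Hello DataFrame")
      .master("local[3]")
      .getOrCreate()

    // Assumes the same four-column CSV without a header row.
    val employeesDF = spark.read
      .option("inferSchema", "true")
      .csv("data\\employees_rdd.csv")
      .toDF("employeeID", "employeeName", "department", "startDate")

    // Equivalent of the groupBy/size step in the RDD version.
    employeesDF.groupBy("department").count().show()

    spark.stop()
  }
}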