A basic example of an Apache Spark application using RDDs
package le.pioneer

import org.apache.spark.{SparkConf, SparkContext}

// hard-coded Spark application using the RDD API instead of DataFrames
object HelloRDD extends Serializable {

  private case class Employee(employeeID: Int, employeeName: String, department: String, startDate: String)

  def main(args: Array[String]): Unit = {
    val data_path = "data\\employees_rdd.csv"

    val sparkAppConf = new SparkConf().setAppName("Hello RDD").setMaster("local[3]")
    val sparkContext = new SparkContext(sparkAppConf)

    // read the raw CSV as lines of text and spread them over two partitions
    val linesRDD = sparkContext.textFile(data_path)
    val partitionRDD = linesRDD.repartition(2)

    // split each line on commas, trim the fields, and map them to Employee records
    val colsRDD = partitionRDD.map(line => line.split(",").map(_.trim))
    val employeeRDD = colsRDD.map(cols => Employee(cols(0).toInt, cols(1), cols(2), cols(3)))

    // group the employees by department, then count each group
    val depGroupRdd = employeeRDD.groupBy(e => e.department)
    val depCountRdd = depGroupRdd.map {
      case (d, emp) => (d, emp.size)
    }

    println(depCountRdd.collect().mkString("\n"))

    sparkContext.stop()
  }
}
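
The program expects a headerless CSV with four comma-separated columns: an integer employee ID, a name, a department, and a start date. The rows below are hypothetical, only meant to illustrate the expected shape of data\employees_rdd.csv:

101, Maria, Engineering, 2019-03-01
102, Omar, Engineering, 2021-07-15
103, Lina, Sales, 2020-11-02

For that input, the program would print one (department, count) pair per line, in no guaranteed order:

(Engineering,2)
(Sales,1)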
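
As the comment in the code notes, the same aggregation is more commonly written against the DataFrame API. The sketch below is a minimal equivalent, assuming the same headerless four-column CSV; the column names are illustrative, since the file defines none:

package le.pioneer

import org.apache.spark.sql.SparkSession

object HelloDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Hello DataFrame")
      .master("local[3]")
      .getOrCreate()

    // read the headerless CSV; ignoreLeadingWhiteSpace stands in for the
    // per-field trim done in the RDD version
    val employeesDF = spark.read
      .option("inferSchema", "true")
      .option("ignoreLeadingWhiteSpace", "true")
      .csv("data\\employees_rdd.csv")
      .toDF("employeeID", "employeeName", "department", "startDate")

    // group by department and count, mirroring the groupBy + size step above
    employeesDF.groupBy("department").count().show()

    spark.stop()
  }
}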