Skip to content

Instantly share code, notes, and snippets.

@tecmaverick
Created December 19, 2022 09:25
Show Gist options
  • Save tecmaverick/389a36a5b26e94866a782032f3a439ea to your computer and use it in GitHub Desktop.
Save tecmaverick/389a36a5b26e94866a782032f3a439ea to your computer and use it in GitHub Desktop.
//Input Data
// studentid,coursename_with_attendance
// 01,CHEM:12|PHY:33|MATH:22
// 02,CHEM:34|PHY:3
// 03,MATH:12|COMP:45|CHEM:12
// 04,MATH:67|PHY:76
// 05,HIST:88|MARKT:33|BIOL:55
// 06,BIOL:88|PHY:77
// 07,BOTONY:34|ZOOL:77
// 08,BOTONY:34|COMP:99
// 09,HIST:36|COMP:66
// 010,MATH:44|COMP:32|STAT:23
// 011,COMP:44|STAT:66
//Output Data
// +----------+--------------------+----------------+
// |courseName|attendanceHoursTotal|studentsEnrolled|
// +----------+--------------------+----------------+
// |      CHEM|                  58|               3|
// |      MATH|                 145|               4|
// |      COMP|                 286|               5|
// |      HIST|                 124|               2|
// |      BIOL|                 143|               2|
// |       PHY|                 189|               4|
// |     MARKT|                  33|               1|
// |    BOTONY|                  68|               2|
// |      ZOOL|                  77|               1|
// |      STAT|                  89|               2|
// +----------+--------------------+----------------+
/**
 * Reads the student attendance CSV, turns every `COURSE:HOURS` entry into its own
 * (studentId, courseName, attendanceHours) record, and prints one row per course
 * with the summed attendance hours and the number of enrolment records.
 *
 * Expected line format after the header line:
 * `studentId,COURSE:HOURS|COURSE:HOURS|...`
 *
 * @param args command-line arguments (unused; the input path is hard-coded below)
 */
def main(args: Array[String]): Unit = {
  val filePath = "file:///Users/abe/Personal/Apache Spark/Scripts/AjpRDDReverseMapping/data/data1.csv"

  val spark = SparkSession.builder()
    .master("local[3]")
    .appName("Data Processor")
    .config("spark.sql.shuffle.partitions", 2)
    .getOrCreate()

  try {
    val lines = spark.sparkContext.textFile(filePath)
    // The first line is treated as the header; every line equal to it is dropped.
    val header = lines.first()

    val attendanceRecords = lines
      .filter(_ != header)
      .map { row =>
        val fields = row.split(",")
        // (studentId, ["COURSE:HOURS", ...])
        (fields(0), fields(1).split("\\|"))
      }
      // One (studentId, "COURSE:HOURS") pair per course entry.
      .flatMapValues(entries => entries)
      .map { case (studentId, entry) =>
        val parts = entry.split(":")
        // StudentID, CourseName, CourseAttendanceHours
        (studentId, parts(0), parts(1).toInt)
      }

    // show() returns Unit, so the result is deliberately not bound to a name.
    spark.createDataFrame(attendanceRecords)
      .toDF("studentId", "courseName", "attendanceHours")
      .groupBy("courseName")
      .agg(
        sum("attendanceHours").alias("attendanceHoursTotal"),
        count("studentId").alias("studentsEnrolled")
      )
      .show()
  } finally {
    // Release the local Spark context whether the job succeeded or failed.
    spark.stop()
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment