Basic Example of Apache Spark application using Datasets
package le.pioneer

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.log4j.Logger

object HelloDataSet extends Serializable {

  @transient private lazy val logger: Logger = Logger.getLogger(getClass.getName)

  // Schema of each CSV record; column names must match the CSV header.
  private case class Employee(employeeID: Int, employeeName: String, department: String, startDate: String)

  def main(args: Array[String]): Unit = {
    // The application expects the path to the input CSV as its only argument.
    if (args.length == 0) {
      logger.error("Usage: HelloDataSet filename.csv")
      System.exit(1)
    }

    val spark: SparkSession = SparkSession.builder()
      .appName("Hello DataSet")
      .master("local[3]")
      .getOrCreate()

    // Brings in the encoders needed to convert the DataFrame to Dataset[Employee].
    import spark.implicits._

    // Read the CSV with a header row, infer column types, and map rows to the case class.
    val employeeDS: Dataset[Employee] = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv(args(0)).as[Employee]

    // Typed filter: keep only employees in the "data analytics" department.
    val da_employees = employeeDS.filter(e => e.department == "data analytics")
    da_employees.show()

    spark.stop()
  }
}
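The input CSV is expected to have a header row whose columns match the Employee fields (employeeID, employeeName, department, startDate). One design note: a typed filter with a Scala lambda, as used above, is opaque to the Catalyst optimizer because it deserializes each row and runs arbitrary code. An equivalent column-expression filter keeps the predicate visible to the optimizer. The lines below are only a sketch of that alternative, reusing the employeeDS value from the program above.

import org.apache.spark.sql.functions.col

// Same result as the typed filter, expressed as a column predicate
// that Catalyst can analyze and push down where possible.
val daEmployees = employeeDS.filter(col("department") === "data analytics")
daEmployees.show()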