@najikadri
Created January 27, 2024 22:59
Basic example of an Apache Spark application using Datasets
package le.pioneer

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.log4j.Logger

object HelloDataSet extends Serializable {

  // @transient + lazy so the logger is not serialized with closures shipped to executors
  @transient private lazy val logger: Logger = Logger.getLogger(getClass.getName)

  // Case class defining the Dataset's schema; field names must match the CSV header columns
  private case class Employee(employeeID: Int, employeeName: String, department: String, startDate: String)

  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      logger.error("Usage: HelloDataSet filename.csv")
      System.exit(1)
    }

    val spark: SparkSession = SparkSession.builder()
      .appName("Hello DataSet")
      .master("local[3]") // run locally with 3 threads
      .getOrCreate()

    // Brings in the implicit Encoder needed by .as[Employee]
    import spark.implicits._

    // Read the CSV into a strongly typed Dataset[Employee]
    val employeeDS: Dataset[Employee] = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv(args(0)).as[Employee]

    // Typed filter: the lambda receives Employee objects rather than untyped Rows
    val da_employees = employeeDS.filter(e => e.department == "data analytics")
    da_employees.show()

    spark.stop()
  }
}
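For reference, a minimal input file could look like the sketch below. The file name employees.csv and the sample rows are hypothetical; what matters is that the header matches the Employee case class fields and that inferSchema leaves startDate as a plain string so the .as[Employee] conversion succeeds.

employeeID,employeeName,department,startDate
1,Alice,data analytics,2021-03-01
2,Bob,engineering,2020-07-15
3,Carol,data analytics,2022-11-20

Packaged as a jar (the jar name here is an assumption), the application can then be launched with spark-submit:

spark-submit --class le.pioneer.HelloDataSet hello-dataset.jar employees.csv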