lieuzhenghong/SimpleApp.scala

## SimpleApp.scala
/**
 * Spark batch job ("TripAnalysis") that reads trip JSON documents, flattens
 * their `data` arrays, and writes one CSV containing, per `linkID`, the number
 * of records and the average `absVelocity`.
 */
object SimpleApp {

  /**
   * Entry point of the job.
   *
   * Reads every JSON document under `s3a://trips/`, aggregates by `linkID`,
   * and writes the result (with a header row) to `s3a://results/results_49998`.
   *
   * @param args command-line arguments; not used by this job
   */
  def main(args: Array[String]): Unit = {
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.explode

    val spark = SparkSession.builder.appName("TripAnalysis").getOrCreate()

    // Stop the session even when a stage fails, so the application does not
    // linger holding cluster resources.
    try {
      import spark.implicits._

      val resultsPath = "s3a://results/"
      val paths = "s3a://trips/*"

      // "multiline" lets a single JSON document span several lines.
      val tripDF = spark.read.option("multiline", "true").json(paths)

      // "Explode" the data array into individual rows
      val linksDF = tripDF.select(explode($"data").as("data"))
      val linksDF2 = linksDF.select("data.dbResponse.linkID", "data.absVelocity")

      // Create a temporary view using the DataFrame so it can be queried with SQL.
      linksDF2.createOrReplaceTempView("times")
      /*
        root
        |-- linkID: string (nullable = true)
        |-- absVelocity: double (nullable = true)
      */

      // A plain "..." literal cannot span lines in Scala, so the query uses a
      // triple-quoted string. The explicit alias keeps the cast column named
      // `linkID` for the groupBy below.
      // NOTE(review): the NULL filter runs on the original string column; a
      // non-numeric linkID presumably becomes NULL after the cast — confirm
      // whether such rows should be dropped.
      val tDF = spark.sql(
        """SELECT CAST(linkID AS LONG) AS linkID, absVelocity
          |FROM times
          |WHERE linkID IS NOT NULL AND absVelocity IS NOT NULL""".stripMargin)

      val groupedDS = tDF.groupBy("linkID")
      val avgsDS = groupedDS.agg(
        "linkID" -> "count",
        "absVelocity" -> "avg"
      ).sort($"linkID".asc)

      // coalesce(1) collapses the result into a single partition before writing.
      avgsDS
        .coalesce(1)
        .write
        .option("header", "true")
        .csv(resultsPath + "results_49998")
    } finally {
      spark.stop()
    }
  }
}
	/**
	 * Spark batch job ("TripAnalysis") that reads trip JSON documents, flattens
	 * their `data` arrays, and writes one CSV containing, per `linkID`, the
	 * number of records and the average `absVelocity`.
	 */
	object SimpleApp {

	  /**
	   * Entry point of the job.
	   *
	   * Reads every JSON document under `s3a://trips/`, aggregates by `linkID`,
	   * and writes the result (with a header row) to
	   * `s3a://results/results_49998`.
	   *
	   * @param args command-line arguments; not used by this job
	   */
	  def main(args: Array[String]): Unit = {
	    import org.apache.spark.sql.SparkSession
	    import org.apache.spark.sql.functions.explode

	    val spark = SparkSession.builder.appName("TripAnalysis").getOrCreate()

	    // Stop the session even when a stage fails, so the application does not
	    // linger holding cluster resources.
	    try {
	      import spark.implicits._

	      val resultsPath = "s3a://results/"
	      val paths = "s3a://trips/*"

	      // "multiline" lets a single JSON document span several lines.
	      val tripDF = spark.read.option("multiline", "true").json(paths)

	      // "Explode" the data array into individual rows
	      val linksDF = tripDF.select(explode($"data").as("data"))
	      val linksDF2 = linksDF.select("data.dbResponse.linkID", "data.absVelocity")

	      // Create a temporary view using the DataFrame so it can be queried with SQL.
	      linksDF2.createOrReplaceTempView("times")
	      /*
	        root
	        |-- linkID: string (nullable = true)
	        |-- absVelocity: double (nullable = true)
	      */

	      // A plain "..." literal cannot span lines in Scala, so the query uses a
	      // triple-quoted string. The explicit alias keeps the cast column named
	      // `linkID` for the groupBy below.
	      // NOTE(review): the NULL filter runs on the original string column; a
	      // non-numeric linkID presumably becomes NULL after the cast — confirm
	      // whether such rows should be dropped.
	      val tDF = spark.sql(
	        """SELECT CAST(linkID AS LONG) AS linkID, absVelocity
	          |FROM times
	          |WHERE linkID IS NOT NULL AND absVelocity IS NOT NULL""".stripMargin)

	      val groupedDS = tDF.groupBy("linkID")
	      val avgsDS = groupedDS.agg(
	        "linkID" -> "count",
	        "absVelocity" -> "avg"
	      ).sort($"linkID".asc)

	      // coalesce(1) collapses the result into a single partition before writing.
	      avgsDS
	        .coalesce(1)
	        .write
	        .option("header", "true")
	        .csv(resultsPath + "results_49998")
	    } finally {
	      spark.stop()
	    }
	  }
	}