import org.apache.spark.sql.SaveMode

object RDD extends SparkSessionWrapper {
  import spark.implicits._

  // Register the taxi-zone dictionary (CSV) and the trip data (Parquet) as temp views.
  spark.read.format("csv").option("header", "true").load("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones.csv").createOrReplaceTempView("DICT")
  spark.read.parquet("/home/rudi/IdeaProjects/RDD/src/main/resources/data/yellow_taxi_jan_25_2018").createOrReplaceTempView("TABL")

  // Count trips per borough and save the result as tab-delimited CSV
  // (the plain "text" source accepts only a single string column).
  spark.sql("""SELECT Borough, count(*) AS count FROM TABL t JOIN DICT d ON t.PULocationID = d.LocationID GROUP BY Borough""")
    .write.mode(SaveMode.Overwrite).format("csv").option("delimiter", "\t")
    .save("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones_res.txt")

  spark.read.format("csv").option("delimiter", "\t").load("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones_res.txt").show()
}
import org.apache.spark.sql.DataFrame

object RDD extends SparkSessionWrapper {
  import spark.implicits._

  // Zone dictionary: cast LocationID to Int so the join key matches PULocationID in the trip data.
  val df1: DataFrame = spark.read.format("csv").option("header", "true").load("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones.csv")
  val df3: DataFrame = df1.withColumn("LocationID", df1("LocationID").cast("int"))

  // Trip data for 25 Jan 2018.
  val df2: DataFrame = spark.read.parquet("/home/rudi/IdeaProjects/RDD/src/main/resources/data/yellow_taxi_jan_25_2018")

  // Join trips to zones, sum RatecodeID per borough, and write the aggregate as Parquet.
  df2.join(df3, df2("PULocationID") === df3("LocationID"), "inner")
    .groupBy("Borough").sum("RatecodeID")
    .write.parquet("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones1")

  spark.read.parquet("/home/rudi/IdeaProjects/RDD/src/main/resources/data/taxi_zones1").show()
}
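
Both versions extend a SparkSessionWrapper trait that is not included in the gist. A minimal sketch of what it might provide, assuming it simply exposes a local SparkSession (the trait and field names come from the snippets above; the master URL and app name are placeholders):

import org.apache.spark.sql.SparkSession

trait SparkSessionWrapper {
  // Lazily build (or reuse) a local SparkSession; adjust master/appName when running on a cluster.
  lazy val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("RDD")
    .getOrCreate()
}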