Skip to content

Instantly share code, notes, and snippets.

@mrandrewandrade
Created December 5, 2017 17:37
Show Gist options
  • Save mrandrewandrade/78fed9f02119d5b55a50ca55a66f6297 to your computer and use it in GitHub Desktop.
Code examples from Spark The Definitive Guide
# Root of the book's sample-data checkout; point this at your local copy.
dataset_path = "/path/to/Spark-The-Definitive-Guide/data/"

# Warm-up example: a one-column DataFrame of 0..999, filtered to even
# values, with count() as the action that actually triggers execution.
myRange = spark.range(1000).toDF("number")
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.count()
# Load the 2015 flight-summary CSV, letting Spark infer column types
# from the data and treat the first row as a header.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Peek at a few rows, then inspect the physical plan for a sort
# before running it.
flightData2015.take(3)
sortedFlights = flightData2015.sort("count")
sortedFlights.explain()
sortedFlights.take(2)

# Register the DataFrame as a temp view so it is queryable via SQL.
flightData2015.createOrReplaceTempView("flight_data_2015")
# The same aggregation expressed two ways — raw SQL against the temp
# view, and the DataFrame API. explain() shows both compile to the
# same physical plan.
sqlWay = spark.sql(
    """ SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """
)
sqlWay.explain()

dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()
dataFrameWay.explain()
# Import under an alias: a bare `from ... import max` shadows Python's
# builtin max() for the rest of the script.
from pyspark.sql.functions import max as spark_max

# Single-value aggregation: the largest flight count in the dataset.
flightData2015.select(spark_max("count")).take(1)

# Top-5 destinations by total flight count, via SQL...
maxSql = spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) as destination_total FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME ORDER BY sum(count) DESC LIMIT 5 """)
maxSql.collect()

# ...and the equivalent query via the DataFrame API.
from pyspark.sql.functions import desc
flightData2015.groupBy("DEST_COUNTRY_NAME").sum("count").withColumnRenamed("sum(count)", "destination_total").sort(desc("destination_total")).limit(5).collect()
# Re-read the same summary CSV through the generic reader interface
# (format + load rather than the csv() shortcut).
DF1 = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Three independent grouped counts; each collect() pulls the grouped
# rows back to the driver as a list of Row objects.
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
# FIX: the original line had mismatched quotes — .load("dataset_path + "...)
# put the variable name inside the string literal, which is a SyntaxError.
# The path variable must be concatenated outside the literal.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "retail-data/by-day/*.csv")
)
staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema

from pyspark.sql.functions import window, column, desc, col

# Per-customer revenue bucketed into 1-day windows of InvoiceDate,
# ordered by the windowed sum; take(5) shows the top buckets.
staticDataFrame.selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate",
).groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day")
).sum("total_cost").orderBy(desc("sum(total_cost)")).take(5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment