Skip to content

Instantly share code, notes, and snippets.

@mrandrewandrade
Created December 5, 2017 17:37
Show Gist options
  • Save mrandrewandrade/78fed9f02119d5b55a50ca55a66f6297 to your computer and use it in GitHub Desktop.
Code examples from Spark The Definitive Guide
# Root of the book's sample-data checkout; point this at your local copy.
dataset_path = "/path/to/Spark-The-Definitive-Guide/data/"

# Warm-up example: a one-column DataFrame of 0..999, filtered to even
# values, with count() as the action that actually triggers execution.
myRange = spark.range(1000).toDF("number")
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.count()
# Load the 2015 flight-summary CSV, letting Spark infer column types
# from the data and treat the first row as a header.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Peek at a few rows, then inspect the physical plan for a sort
# before running it.
flightData2015.take(3)
sortedFlights = flightData2015.sort("count")
sortedFlights.explain()
sortedFlights.take(2)

# Register the DataFrame as a temp view so it is queryable via SQL.
flightData2015.createOrReplaceTempView("flight_data_2015")
# The same aggregation expressed two ways — raw SQL against the temp
# view, and the DataFrame API. explain() shows both compile to the
# same physical plan.
sqlWay = spark.sql(
    """ SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """
)
sqlWay.explain()

dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()
dataFrameWay.explain()
# Import under an alias: a bare `from ... import max` shadows Python's
# builtin max() for the rest of the script.
from pyspark.sql.functions import max as spark_max

# Single-value aggregation: the largest flight count in the dataset.
flightData2015.select(spark_max("count")).take(1)

# Top-5 destinations by total flight count, via SQL...
maxSql = spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) as destination_total FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME ORDER BY sum(count) DESC LIMIT 5 """)
maxSql.collect()

# ...and the equivalent query via the DataFrame API.
from pyspark.sql.functions import desc
flightData2015.groupBy("DEST_COUNTRY_NAME").sum("count").withColumnRenamed("sum(count)", "destination_total").sort(desc("destination_total")).limit(5).collect()
# Re-read the same summary CSV through the generic reader interface
# (format + load rather than the csv() shortcut).
DF1 = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Three independent grouped counts; each collect() pulls the grouped
# rows back to the driver as a list of Row objects.
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
# FIX: the original line had mismatched quotes — .load("dataset_path + "...)
# put the variable name inside the string literal, which is a SyntaxError.
# The path variable must be concatenated outside the literal.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "retail-data/by-day/*.csv")
)
staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema

from pyspark.sql.functions import window, column, desc, col

# Per-customer revenue bucketed into 1-day windows of InvoiceDate,
# ordered by the windowed sum; take(5) shows the top buckets.
staticDataFrame.selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate",
).groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day")
).sum("total_cost").orderBy(desc("sum(total_cost)")).take(5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment