@mattf
Last active March 24, 2021 23:04
from pyspark.sql.types import *
from pyspark.sql.functions import mean
with open("data.csv", "w") as fp:
    fp.write("""
a,
b,1
c,
d,1
""")
# data is a CSV file w/ two fields, a string and a double
schema = StructType([StructField("a", StringType()), StructField("b", DoubleType(), True)])
df = spark.read.format("csv").schema(schema).load("data.csv")
# the data has some nulls; drop any row that contains a null
df = df.na.drop()
# inspect the physical plan -- the Gpu* operators indicate the query runs on the GPU
df.select(mean(df['b'])).explain()
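For readers unfamiliar with `na.drop`, here is a pure-Python analogue of drop-any-null followed by a mean, sketched on hypothetical sample rows (not the gist's data); `None` stands in for SQL null.

```python
# sketch: na.drop(how="any") keeps only rows with no None values,
# then mean() averages the surviving values of column b
sample = [("a", None), ("b", 2.0), ("c", None), ("d", 4.0)]
kept = [row for row in sample if all(v is not None for v in row)]
avg_b = sum(b for _, b in kept) / len(kept)
print(kept)   # [('b', 2.0), ('d', 4.0)]
print(avg_b)  # 3.0
```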
#== Physical Plan ==
#*(2) HashAggregate(keys=[], functions=[avg(b#1)])
#+- Exchange SinglePartition, true, [id=#142]
# +- *(1) HashAggregate(keys=[], functions=[partial_avg(b#1)])
# +- GpuColumnarToRow false
# +- GpuProject [b#1]
# +- GpuCoalesceBatches TargetSize(2147483647)
# +- GpuFilter GpuAtLeastNNulls(n, a#0,b#1)
# +- GpuFileGpuScan csv [a#0,b#1] Batched: true, DataFilters: [AtLeastNNulls(n, a#0,b#1)], Format: CSV, Location: InMemoryFileIndex[file:/tmp/spark-3808f7c3-eb64-4ecc-b699-1ae8f0266d04/userFiles-29db7dd7-d72d-47..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<a:string,b:double>
df.select(mean(df['b'])).show()
#+------------------+
#|            avg(b)|
#+------------------+
#|0.6666666666666666|
#+------------------+