# Import Window and the SQL functions used below
from pyspark.sql.window import Window
from pyspark.sql.functions import col, sum, dense_rank
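# The snippet assumes `spark` is an existing SparkSession (as in spark-shell
# or a notebook). A minimal sketch for running this as a standalone script,
# assuming local mode; the app name is illustrative:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("sales_ranking").getOrCreate()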
# Read the source table in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT seller_id,
       product_id,
       total_pieces,
       DENSE_RANK() OVER (PARTITION BY seller_id
                          ORDER BY total_pieces DESC) AS dense_rank
FROM (
    SELECT seller_id,
           product_id,
           SUM(num_pieces_sold) AS total_pieces
    FROM sales_table
    GROUP BY seller_id,
             product_id
)
'''
sales_table_agg = sales_table.groupBy(col("seller_id"), col("product_id")) \
    .agg(sum("num_pieces_sold").alias("total_pieces"))
# Define the Window: partition the table on the seller ID and sort each
# group by total pieces sold in descending order, matching the SQL above
window_specifications = Window.partitionBy(col("seller_id")).orderBy(col("total_pieces").desc())
# Apply the dense_rank function, creating the window according to the specs above
sales_table_agg.withColumn('dense_rank', dense_rank().over(window_specifications)).show()
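# For comparison, a minimal sketch of running the equivalent SQL from the
# docstring above directly, assuming the DataFrame is registered as a
# temporary view (the view name "sales_table" is chosen for illustration):
sales_table.createOrReplaceTempView("sales_table")
spark.sql("""
    SELECT seller_id,
           product_id,
           total_pieces,
           DENSE_RANK() OVER (PARTITION BY seller_id
                              ORDER BY total_pieces DESC) AS dense_rank
    FROM (
        SELECT seller_id,
               product_id,
               SUM(num_pieces_sold) AS total_pieces
        FROM sales_table
        GROUP BY seller_id,
                 product_id
    )
""").show()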