# Import Window and the SQL functions used below
from pyspark.sql.window import Window
from pyspark.sql.functions import col, sum, dense_rank
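# The snippet assumes `spark` is an existing SparkSession (as in spark-shell
# or a notebook). A minimal sketch for running this as a standalone script,
# assuming local mode; the app name is illustrative:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("sales_ranking").getOrCreate()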
# Read the source table in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT seller_id,
       product_id,
       total_pieces,
       DENSE_RANK() OVER (PARTITION BY seller_id
                          ORDER BY total_pieces DESC) AS dense_rank
FROM (
    SELECT seller_id,
           product_id,
           SUM(num_pieces_sold) AS total_pieces
    FROM sales_table
    GROUP BY seller_id,
             product_id
)
'''
sales_table_agg = sales_table.groupBy(col("seller_id"), col("product_id")) \
    .agg(sum("num_pieces_sold").alias("total_pieces"))
# Define the Window: partition the table on the seller ID and sort each
# group by total pieces sold in descending order, matching the SQL above
window_specifications = Window.partitionBy(col("seller_id")).orderBy(col("total_pieces").desc())
# Apply the dense_rank function, creating the window according to the specs above
sales_table_agg.withColumn('dense_rank', dense_rank().over(window_specifications)).show()
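# For comparison, a minimal sketch of running the equivalent SQL from the
# docstring above directly, assuming the DataFrame is registered as a
# temporary view (the view name "sales_table" is chosen for illustration):
sales_table.createOrReplaceTempView("sales_table")
spark.sql("""
    SELECT seller_id,
           product_id,
           total_pieces,
           DENSE_RANK() OVER (PARTITION BY seller_id
                              ORDER BY total_pieces DESC) AS dense_rank
    FROM (
        SELECT seller_id,
               product_id,
               SUM(num_pieces_sold) AS total_pieces
        FROM sales_table
        GROUP BY seller_id,
                 product_id
    )
""").show()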