Skip to content

Instantly share code, notes, and snippets.

@aialenti
Last active February 17, 2021 14:18
Show Gist options
  • Save aialenti/9bd4531e66ff58b0f4c6ad883381379a to your computer and use it in GitHub Desktop.
Save aialenti/9bd4531e66ff58b0f4c6ad883381379a to your computer and use it in GitHub Desktop.
# Load the sales data from its Parquet directory.
sales_table = spark.read.parquet("./data/sales_parquet")
'''
CREATE TABLE sales_table_aggregated AS
SELECT COLLECT_SET(num_pieces_sold) AS num_pieces_sold_set,
seller_id
FROM sales_table
GROUP BY seller_id;
SELECT EXPLODE(num_pieces_sold_set) AS exploded_num_pieces_set
FROM sales_table_aggregated;
'''
# The string above is the SQL equivalent of the DataFrame pipeline below;
# it is a bare expression statement and is never executed.

# Step 1: group by seller and gather each seller's num_pieces_sold values
# into a single set-valued column (the COLLECT_SET / GROUP BY part).
_pieces_sold_set = collect_set(col("num_pieces_sold")).alias("num_pieces_sold_set")
sales_table_execution_aggregated = sales_table.groupBy(col("seller_id")).agg(
    _pieces_sold_set
)

# Step 2: explode the collected column back into individual rows
# (the EXPLODE part); only the exploded column is selected.
_exploded_pieces = explode(col("num_pieces_sold_set")).alias("exploded_num_pieces_set")
sales_table_execution_exploded = sales_table_execution_aggregated.select(
    _exploded_pieces
)

# Print the first 10 rows with truncation of long cell values enabled.
sales_table_execution_exploded.show(n=10, truncate=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment