# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT *
FROM sales_table
WHERE seller_id NOT IN (SELECT seller_id FROM sellers_table)
'''
# Left Anti joins are a way to express the NOT IN operation in SQL
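# A minimal sketch of the equivalent DataFrame code (the snippet above omits it):
# keep only the sales rows whose seller_id has no match in sellers_table.
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left_anti"
)
result.show()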
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT *
FROM sales_table
WHERE seller_id IN (SELECT seller_id FROM sellers_table)
'''
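# A left semi join is the DataFrame way to express SQL's IN: keep only the sales
# rows whose seller_id matches a row in sellers_table. Minimal sketch:
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left_semi"
)
result.show()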
# joins.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT a.*,
b.*
FROM sales_table a
LEFT JOIN sellers_table b
ON a.seller_id = b.seller_id
'''
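# Minimal sketch of the equivalent DataFrame code: a plain left (outer) join,
# keeping every sales row and attaching seller columns where they match.
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left"
)
result.show()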
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
CREATE TABLE part_1 AS
SELECT *
FROM sales_table
WHERE num_pieces_sold > 50;
CREATE TABLE part_2 AS
SELECT *
FROM sales_table
WHERE num_pieces_sold <= 50;
'''
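# Sketch of the DataFrame side: split the table with complementary filters, then
# reunite the two halves with a union. Note that the `<= 50` condition for part_2
# is an assumption; the original snippet is truncated after the second CREATE TABLE.
part_1 = sales_table.filter(sales_table["num_pieces_sold"] > 50)
part_2 = sales_table.filter(sales_table["num_pieces_sold"] <= 50)
# DataFrame.union() has UNION ALL semantics; chain .distinct() to mimic SQL UNION
reunited = part_1.union(part_2)
reunited.show()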
# casewhen.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT seller_id,
CASE WHEN num_pieces_sold < 30 THEN 'Lower than 30'
     WHEN num_pieces_sold < 60 THEN 'Between 31 and 60'
     WHEN num_pieces_sold < 90 THEN 'Between 61 and 90'
     ELSE 'More than 91'
END AS sales_bucket
FROM sales_table
'''
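# Sketch of the DataFrame equivalent: chained when()/otherwise() calls mirror the
# CASE WHEN ladder, and alias() names the resulting column.
from pyspark.sql.functions import col, when

bucketed = sales_table.select(
    col("seller_id"),
    when(col("num_pieces_sold") < 30, "Lower than 30")
    .when(col("num_pieces_sold") < 60, "Between 31 and 60")
    .when(col("num_pieces_sold") < 90, "Between 61 and 90")
    .otherwise("More than 91")
    .alias("sales_bucket")
)
bucketed.show()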
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT DISTINCT seller_id,
date
FROM sales_table
'''
from pyspark.sql.functions import col

sales_table_execution_plan = sales_table.select(
    col("seller_id"), col("date")
).distinct()
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT order_id,
product_id,
seller_id,
date,
num_pieces_sold,
bill_raw_text
FROM sales_table
'''
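# Minimal sketch of the equivalent DataFrame code: a projection is just a select
# with the column names.
projected = sales_table.select(
    "order_id", "product_id", "seller_id", "date", "num_pieces_sold", "bill_raw_text"
)
projected.show()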
# rename.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT order_id,
product_id,
seller_id,
date,
num_pieces_sold AS pieces,
bill_raw_text
FROM sales_table
'''
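# Minimal sketch of the equivalent DataFrame code: alias() renames a column inside
# a select, mirroring SQL's AS.
from pyspark.sql.functions import col

renamed = sales_table.select(
    "order_id", "product_id", "seller_id", "date",
    col("num_pieces_sold").alias("pieces"),
    "bill_raw_text"
)
renamed.show()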
# Print Schema
sales_table_execution_plan.printSchema()
# Show the execution plan of the query above
sales_table_execution_plan.explain()