# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT *
FROM sales_table
WHERE seller_id NOT IN (SELECT seller_id FROM sellers_table)
'''
# Left Anti joins are a way to express the NOT IN operation in SQL
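# A minimal sketch of the equivalent DataFrame code (the snippet above omits it):
# keep only the sales rows whose seller_id has no match in sellers_table.
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left_anti"
)
result.show()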
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT *
FROM sales_table
WHERE seller_id IN (SELECT seller_id FROM sellers_table)
'''
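# A left semi join is the DataFrame way to express SQL's IN: keep only the sales
# rows whose seller_id matches a row in sellers_table. Minimal sketch:
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left_semi"
)
result.show()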
# joins.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")
'''
SELECT a.*,
b.*
FROM sales_table a
LEFT JOIN sellers_table b
ON a.seller_id = b.seller_id
'''
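# Minimal sketch of the equivalent DataFrame code: a plain left (outer) join,
# keeping every sales row and attaching seller columns where they match.
result = sales_table.join(
    sellers_table,
    on=sales_table["seller_id"] == sellers_table["seller_id"],
    how="left"
)
result.show()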
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
CREATE TABLE part_1 AS
SELECT *
FROM sales_table
WHERE num_pieces_sold > 50;
CREATE TABLE part_2 AS
SELECT *
FROM sales_table
WHERE num_pieces_sold <= 50;
'''
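# Sketch of the DataFrame side: split the table with complementary filters, then
# reunite the two halves with a union. Note that the `<= 50` condition for part_2
# is an assumption; the original snippet is truncated after the second CREATE TABLE.
part_1 = sales_table.filter(sales_table["num_pieces_sold"] > 50)
part_2 = sales_table.filter(sales_table["num_pieces_sold"] <= 50)
# DataFrame.union() has UNION ALL semantics; chain .distinct() to mimic SQL UNION
reunited = part_1.union(part_2)
reunited.show()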
# casewhen.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT seller_id,
CASE WHEN num_pieces_sold < 30 THEN 'Lower than 30'
     WHEN num_pieces_sold < 60 THEN 'Between 31 and 60'
     WHEN num_pieces_sold < 90 THEN 'Between 61 and 90'
     ELSE 'More than 91'
END AS sales_bucket
FROM sales_table
'''
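# Sketch of the DataFrame equivalent: chained when()/otherwise() calls mirror the
# CASE WHEN ladder, and alias() names the resulting column.
from pyspark.sql.functions import col, when

bucketed = sales_table.select(
    col("seller_id"),
    when(col("num_pieces_sold") < 30, "Lower than 30")
    .when(col("num_pieces_sold") < 60, "Between 31 and 60")
    .when(col("num_pieces_sold") < 90, "Between 61 and 90")
    .otherwise("More than 91")
    .alias("sales_bucket")
)
bucketed.show()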
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT DISTINCT seller_id,
date
FROM sales_table
'''
from pyspark.sql.functions import col

sales_table_execution_plan = sales_table.select(
    col("seller_id"), col("date")
).distinct()
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT order_id,
product_id,
seller_id,
date,
num_pieces_sold,
bill_raw_text
FROM sales_table
'''
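# Minimal sketch of the equivalent DataFrame code: a projection is just a select
# with the column names.
projected = sales_table.select(
    "order_id", "product_id", "seller_id", "date", "num_pieces_sold", "bill_raw_text"
)
projected.show()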
# rename.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT order_id,
product_id,
seller_id,
date,
num_pieces_sold AS pieces,
bill_raw_text
FROM sales_table
'''
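# Minimal sketch of the equivalent DataFrame code: alias() renames a column inside
# a select, mirroring SQL's AS.
from pyspark.sql.functions import col

renamed = sales_table.select(
    "order_id", "product_id", "seller_id", "date",
    col("num_pieces_sold").alias("pieces"),
    "bill_raw_text"
)
renamed.show()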
# Print Schema
sales_table_execution_plan.printSchema()
# Show the execution plan of the query above
sales_table_execution_plan.explain()